# import libraries here; add more as necessary
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
# magic word for producing visualizations in notebook
%matplotlib inline
There are four files associated with this project (not including this one):

- Udacity_AZDIAS_Subset.csv: Demographics data for the general population of Germany; 891211 persons (rows) x 85 features (columns).
- Udacity_CUSTOMERS_Subset.csv: Demographics data for customers of a mail-order company; 191652 persons (rows) x 85 features (columns).
- Data_Dictionary.md: Detailed information file about the features in the provided datasets.
- AZDIAS_Feature_Summary.csv: Summary of feature attributes for demographics data; 85 features (rows) x 4 columns.

# Load in the general demographics data.
azdias = pd.read_csv("Udacity_AZDIAS_Subset.csv", sep=';')
# Load in the feature summary file.
feat_info = pd.read_csv("AZDIAS_Feature_Summary.csv", sep=';')
# Check the structure of the data after it's loaded (e.g. print the number of
# rows and columns, print the first few rows).
# TODO: Total number of rows
print("Total number of rows: " + str(azdias.shape[0]))
# TODO: Total number of columns
print("Total number of columns: " + str(azdias.shape[1]))
# First 5 rows
display(azdias.head(n=5))
#randomly pick 10 features to check
feat_info.sample(n=10)
feat_info[feat_info['attribute'] == 'PRAEGENDE_JUGENDJAHRE']
Tip: Add additional cells to keep everything in reasonably-sized chunks! Keyboard shortcut: esc --> a (press escape to enter command mode, then press the 'A' key) adds a new cell before the active cell, and esc --> b adds a new cell after the active cell. If you need to convert an active cell to a markdown cell, use esc --> m; to convert it to a code cell, use esc --> y.
# Identify missing or unknown data values and convert them to NaNs.
col_names = azdias.columns.values.tolist()
def cov2NaNs(number):
    x = feat_info.missing_or_unknown[number]  # string expression of the list, e.g. '[-1,0]'
    x = x.replace(" ", "")                    # remove blanks
    l = x[1:-1].split(',')                    # parse the elements
    for i in l:
        if i.lstrip('-').isdigit():           # numeric code (possibly negative): replace its int form
            azdias.iloc[:, number].replace(int(i), np.nan, inplace=True)
        else:                                 # non-numeric codes such as 'X' or 'XX'
            azdias.iloc[:, number].replace(i, np.nan, inplace=True)

for n in range(0, len(feat_info)):
    cov2NaNs(n)
#save the converted df
#azdias.to_csv('new_azdias.csv', encoding='utf-8', index=False)
#check if successfully converted
print(feat_info.missing_or_unknown[feat_info.attribute =='SEMIO_SOZ'])
print(azdias['SEMIO_SOZ'][azdias['SEMIO_SOZ'] == -1])
print(azdias['SEMIO_SOZ'][azdias['SEMIO_SOZ'] == 9])
azdias['SEMIO_SOZ']
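As a hedged aside, the per-column loop above can also be written as a single vectorized DataFrame.replace call. The sketch below assumes feat_info rows align with azdias columns, exactly as in the loop; parse_codes and azdias_alt are hypothetical names introduced here for illustration.
# Vectorized alternative (sketch): build one {column: codes} map and call replace once.
def parse_codes(s):
    """Parse a string such as '[-1,0]' or '[-1,X]' into a list of codes."""
    tokens = s.strip('[]').replace(' ', '').split(',')
    return [int(t) if t.lstrip('-').isdigit() else t for t in tokens if t != '']

missing_map = {row['attribute']: parse_codes(row['missing_or_unknown'])
               for _, row in feat_info.iterrows()}
azdias_alt = azdias.replace({col: {code: np.nan for code in codes}
                             for col, codes in missing_map.items()})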
How much missing data is present in each column?
Identify and document these columns; some of them might still have justification for keeping or re-encoding their data.
For the remaining features, are there any patterns in which columns have, or share, missing data?
# Perform an assessment of how much missing data there is in each column of the
# dataset.
col_with_missing = pd.DataFrame(data={
    'vars': azdias.columns,
    'count_missing': azdias.isnull().sum(axis=0),
    'missing_perc (%)': round(azdias.isnull().sum(axis=0) / azdias.shape[0], 4) * 100})
vars_with_missing = pd.DataFrame(col_with_missing[col_with_missing['count_missing']>0])
vars_with_missing.sort_values(by='count_missing', ascending=False, inplace=True)
vars_with_missing.reset_index(drop =True, inplace =True)
pd.options.display.max_rows = 61
print('In total, there are {} variables with missing values'.format(len(vars_with_missing)))
print('Columns which share the same proportion of missing values can be checked from the table below')
print('''e.g. columns sharing 14.96% missingness:
KBA05_ANTG3
KBA05_ANTG1
KBA05_ANTG2
KBA05_GBZ
KBA05_ANTG4
MOBI_REGIO''')
display(vars_with_missing)
pd.reset_option("display.max_rows")
# Investigate patterns in the amount of missing data in each column.
col_missing = pd.DataFrame(data={'vars': list(vars_with_missing['vars']), 'count': list(vars_with_missing['count_missing'])})
vars_with_missing = list(vars_with_missing['vars'])
ax = col_missing.plot.bar(x='vars', y='count', rot=90, figsize=(25,5))
#visualize the location of missing values (for all features which have missing values)
fig = plt.subplots(figsize=(15,10))
sns.heatmap(azdias[vars_with_missing].isnull())
# Remove the outlier columns from the dataset. (You'll perform other data
# engineering tasks such as re-encoding and imputation later.)
#Remove columns with 200000 or more missing values (these columns also show long contiguous runs of missing values in the heatmap above)
outlier_vars = list(col_missing['vars'][col_missing['count'] >= 200000])
azdias_new = pd.DataFrame(azdias.drop(outlier_vars, axis=1))
#get the updated list of remaining features
new_vars_with_missing = [v for v in vars_with_missing if v not in outlier_vars]
outlier_vars
azdias_new.head()
#visualize the location of missing values (for the remaining features after outliers removed)
fig = plt.subplots(figsize=(15,10))
sns.heatmap(azdias_new[new_vars_with_missing].isnull())
In total, there are 61 variables with missing values. Most of them have fewer than 200000 missing values; however, the features 'AGER_TYP', 'GEBURTSJAHR', 'TITEL_KZ', 'ALTER_HH', 'KK_KUNDENTYP' and 'KBA05_BAUMAX' have missing counts far beyond 200000 (~22% of rows), so those features are removed from the dataset. Also, from the heatmaps we can see that missingness in affected rows runs through multiple columns. This is a benefit: by simply removing the rows with large amounts of missingness we can resolve most of the missing values at once.
# How much data is missing in each row of the dataset now?
row_with_missing = pd.DataFrame(data={'row_idx':azdias_new.index, 'count_missing': azdias_new.isnull().sum(axis=1)})
#how many rows with missing value
num_miss_row = azdias_new.index[azdias_new.isnull().any(axis=1)]
print('There are {} rows with missing values, totaling {} missing entries.'.format(len(num_miss_row), np.sum(row_with_missing['count_missing'])))
#plot histogram of missingness in rows to choose a suitable threshold
plt.hist(row_with_missing['count_missing'])
# Write code to divide the data into two subsets based on the number of missing
# values in each row.
#assign subset labels to divide dataset into two groups
#from the histogram above, I choose 10 as the threshold of missing values to subset the data (suggested by the reviewer)
azdias_new['subset_label'] = np.where(row_with_missing['count_missing'] < 10, 'missingness < 10', 'missingness >= 10')
print('There are {} rows in first subset and {} rows in the second subset.'.format(sum(azdias_new['subset_label']=='missingness < 10'), sum(azdias_new['subset_label']== 'missingness >= 10')))
azdias_new.head(20)
#select columns with no or few missing values for comparison
col_with_missing = pd.DataFrame(data={'vars':azdias_new.columns, 'count_missing': azdias_new.isnull().sum(axis=0)})
col_with_missing.sort_values(by='count_missing', ascending=True, inplace=True)
#get first 10 features with smallest missingness
vars_for_plot = list(col_with_missing['vars'])[0:10]
#display the 10 features
col_with_missing.head(10)
# Compare the distribution of values for at least five columns where there are
# no or few missing values, between the two subsets.
for f in vars_for_plot:
    fig, ax = plt.subplots(figsize=(15, 10))
    sns.countplot(ax=ax, x=f, hue="subset_label", data=azdias_new)
    plt.ylabel('count', fontsize=10)
    plt.xlabel(f, fontsize=10)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.show();
#create the updated azdias dataset
azdias_new = pd.DataFrame(azdias_new[azdias_new['subset_label'] == 'missingness < 10'])
azdias_new.drop(['subset_label'], axis=1, inplace = True)
#visualize the location of missingness for the new dataset
fig = plt.subplots(figsize=(15,10))
sns.heatmap(azdias_new[new_vars_with_missing].isnull())
From the histogram, we can see that the number of missing values in each row is mostly under 10 (a threshold suggested by the reviewer). So we can remove the rows with 10 or more missing values without losing many rows. From the countplots, we can see that for some of the features the patterns in the data with many missing values show no big difference from the data with few or no missing values; for others, however, there are differences.
# How many features are there of each data type?
for v in outlier_vars:
    feat_info = feat_info[feat_info['attribute'] != v]
pd.DataFrame({'count' : feat_info.groupby(['type']).size()}).reset_index()
#get the list of categorical data
cat_mix_vars = feat_info['attribute'][feat_info['type'] == 'categorical']
cat_mix_vars[0:10]
# Assess categorical variables: which are binary, which are multi-level, and
# which one needs to be re-encoded?
mat_cat_mix = pd.DataFrame(
    [[v, pd.unique(azdias_new[v]), len(pd.unique(azdias_new[v])),
      'binary' if len(pd.unique(azdias_new[v])) == 2 else 'multi'] for v in cat_mix_vars])
mat_cat_mix.columns = ['feature', 'level', 'number_of_levels', 'type']
#display levels of each categorical features
mat_cat_mix
#drop features with more than 10 levels from azdias_new
varstodrop = list(mat_cat_mix['feature'][mat_cat_mix['number_of_levels'] > 10])
varstodrop.append('GEBAEUDETYP') #since there is no 5.0 level in customer data
azdias_new = pd.DataFrame(azdias_new.drop(varstodrop, axis=1))
varstodrop
azdias_new.shape
# Re-encode the levels in ANREDE_KZ, SOHO_KZ and OST_WEST_KZ into [0,1]
azdias_new['ANREDE_KZ'] = azdias_new['ANREDE_KZ'].map({1:0, 2:1})
azdias_new['SOHO_KZ'] = azdias_new['SOHO_KZ'].map({0.0:0, 1.0:1})
azdias_new['OST_WEST_KZ'] = azdias_new['OST_WEST_KZ'].map({'W':0, 'O':1})
#remove from varstodrop
for v in varstodrop:
    mat_cat_mix.drop(mat_cat_mix[mat_cat_mix['feature'] == v].index, inplace=True)
#exempt binary features
#mat_cat_mix.drop(mat_cat_mix[mat_cat_mix['type'] == 'binary'].index, inplace=True)
mat_cat_mix
# Re-encode categorical variable(s) to be kept in the analysis.
vars_to_encode = list(mat_cat_mix['feature'])
azdias_encoded = pd.DataFrame(pd.get_dummies(azdias_new, prefix=vars_to_encode, columns=vars_to_encode))
azdias_encoded.reset_index(drop=True, inplace= True)
print(len(azdias_encoded))
azdias_encoded.head(10)
vars_to_encode
I first build a matrix with the feature name, levels, number of levels, and type of each categorical feature, so that I have a map for the feature engineering. I then drop the features with more than 10 levels from the dataset ('GFK_URLAUBERTYP', 'LP_FAMILIE_FEIN', 'CAMEO_DEU_2015'), plus 'GEBAEUDETYP' (its level 5.0 does not occur in the customer data). After that, I re-encode the levels in ANREDE_KZ, SOHO_KZ and OST_WEST_KZ into 0/1. Finally, I one-hot encode the categorical variables to be kept in the analysis.
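For reference, a toy sketch (made-up values, not project data) of what the one-hot step above does: get_dummies expands a multi-level categorical column into indicator columns.
# Toy illustration of get_dummies on a single multi-level column.
toy = pd.DataFrame({'FINANZTYP': [1, 3, 3, 5]})
print(pd.get_dummies(toy, prefix=['FINANZTYP'], columns=['FINANZTYP']))
#    FINANZTYP_1  FINANZTYP_3  FINANZTYP_5
# 0            1            0            0
# 1            0            1            0
# 2            0            1            0
# 3            0            0            1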
There are a handful of features that are marked as "mixed" in the feature summary and require special treatment in order to be included in the analysis. Two in particular deserve attention; the handling of the rest is up to your own choices:
#display mixed variables
mixed_vars = feat_info['attribute'][feat_info['type'] == 'mixed']
mixed_vars
mat_mixed = pd.DataFrame([[v, pd.unique(azdias_new[v]), len(pd.unique(azdias_new[v]))] for v in mixed_vars])
mat_mixed.columns = ['feature', 'level', 'number_of_levels']
#display levels of each mixed feature
mat_mixed
# Investigate "PRAEGENDE_JUGENDJAHRE" and engineer two new variables.
#it shows there are 16 out of 16 levels including nan.
print(mat_mixed.iloc[2,1])
#duplicate two "PRAEGENDE_JUGENDJAHRE" columns
azdias_encoded['PJ_decade'] = azdias_encoded['PRAEGENDE_JUGENDJAHRE']
azdias_encoded['PJ_movement'] = azdias_encoded['PRAEGENDE_JUGENDJAHRE']
x = {1.0: 40, 2.0: 40, 3.0: 50, 4.0: 50, 5.0: 60, 6.0: 60, 7.0: 60, 8.0: 70, 9.0: 70,
     10.0: 80, 11.0: 80, 12.0: 80, 13.0: 80, 14.0: 90, 15.0: 90}
# movement per Data_Dictionary.md (note: 12 = FDJ / communist party youth organisation is Mainstream)
y = {1.0: 'Mainstream', 2.0: 'Avantgarde', 3.0: 'Mainstream', 4.0: 'Avantgarde',
     5.0: 'Mainstream', 6.0: 'Avantgarde', 7.0: 'Avantgarde', 8.0: 'Mainstream', 9.0: 'Avantgarde',
     10.0: 'Mainstream', 11.0: 'Avantgarde', 12.0: 'Mainstream', 13.0: 'Avantgarde',
     14.0: 'Mainstream', 15.0: 'Avantgarde'}
#mapping and converting two newly created columns
azdias_encoded['PJ_decade'] = azdias_encoded['PJ_decade'].map(x)
azdias_encoded['PJ_movement'] = azdias_encoded['PJ_movement'].map(y)
azdias_encoded['PJ_movement'] = azdias_encoded['PJ_movement'].map({'Mainstream':0, 'Avantgarde':1})
# Investigate "CAMEO_INTL_2015" and engineer two new variables.
#it shows there are 22 out of 25 levels including nan.
print(mat_mixed.iloc[4,1])
#duplicate "CAMEO_INTL_2015" into two new columns
azdias_encoded['CAMEO_wealth'] = azdias_encoded['CAMEO_INTL_2015']
azdias_encoded['CAMEO_life'] = azdias_encoded['CAMEO_INTL_2015']
#map and convert (use .loc to avoid chained-assignment issues)
not_nan = azdias_encoded['CAMEO_INTL_2015'][azdias_encoded['CAMEO_INTL_2015'].notnull()]
azdias_encoded.loc[azdias_encoded['CAMEO_INTL_2015'].notnull(), 'CAMEO_wealth'] = [int(r) % 5 for r in not_nan]
#.map() would require every unique value to be mapped, so remap only the 0s to 5
azdias_encoded.loc[azdias_encoded['CAMEO_wealth'] == 0, 'CAMEO_wealth'] = 5
azdias_encoded.loc[azdias_encoded['CAMEO_INTL_2015'].notnull(), 'CAMEO_life'] = [int(int(r) / 10) for r in not_nan]
#check uniqueness
print(pd.unique(azdias_encoded['CAMEO_wealth']))
print(pd.unique(azdias_encoded['CAMEO_life']))
plt.hist(azdias_encoded['WOHNLAGE'][~np.isnan(azdias_encoded['WOHNLAGE'])]);
plt.hist(azdias_encoded['PLZ8_BAUMAX'][~np.isnan(azdias_encoded['PLZ8_BAUMAX'])]);
#drop original columns and WOHNLAGE, PLZ8_BAUMAX, LP_LEBENSPHASE_FEIN and LP_LEBENSPHASE_GROB
drop_vars = ['PRAEGENDE_JUGENDJAHRE','WOHNLAGE','PLZ8_BAUMAX','CAMEO_INTL_2015','LP_LEBENSPHASE_FEIN','LP_LEBENSPHASE_GROB']
azdias_encoded = pd.DataFrame(azdias_encoded.drop(drop_vars, axis=1))
Similar to what I did for the categorical features, I first build a matrix with the feature name, levels, and number of levels of each mixed feature, so that I have a map for the feature engineering. Then I map and convert 'CAMEO_INTL_2015' and 'PRAEGENDE_JUGENDJAHRE' each into two new columns.
WOHNLAGE: This variable is mostly ordinal, with levels 1-5 describing a building's neighborhood quality from very good to very poor. The issue is that levels 7 and 8 (there is no 6) flag whether the building is in a rural neighborhood, disconnected from neighborhood quality. Since re-encoding the rural flag separately would cause massive missingness, and I do not want to encode the NaNs into a column of their own, I decide to drop it.
PRAEGENDE_JUGENDJAHRE: This variable captures the dominating movement of a person's youth (avantgarde vs. mainstream; east vs. west Germany) on a scale from 1 to 15, with -1 and 0 unknown. I define a dictionary x mapping each level to its decade (40s through 90s) and a dictionary y mapping each level to its movement, as in the code above. Then I duplicate 'PRAEGENDE_JUGENDJAHRE' into 'PJ_decade' and 'PJ_movement', map the former by x and the latter by y, and finally remove 'PRAEGENDE_JUGENDJAHRE'.
CAMEO_INTL_2015: This variable is the German CAMEO Wealth / Life Stage typology mapped to an international two-digit code (11-15, 21-25, ..., 51-55), with -1 and 'XX' unknown. I duplicate 'CAMEO_INTL_2015' into 'CAMEO_wealth' and 'CAMEO_life'. I fill the former with int(r) % 5, which takes the remainder and remaps remainder 0 to 5 so that all scale values are covered, and I fill 'CAMEO_life' with int(int(r) / 10), which takes the tens digit (a short worked check follows this discussion).
PLZ8_BAUMAX: This records the most common building type within the PLZ8 region. The situation is similar to WOHNLAGE: re-encoding it would cause massive missingness, and I do not want to encode the NaNs into a column of their own, so I decide to drop it.
For the features 'LP_LEBENSPHASE_FEIN' (life stage, fine scale) and 'LP_LEBENSPHASE_GROB' (life stage, rough scale), the number of levels is large (especially LP_LEBENSPHASE_FEIN, with more than 40 levels built from 'LP_FAMILIE_FEIN', 'LP_FAMILIE_GROB' and 'LP_STATUS_GROB'), so I decide to drop them both.
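As promised above, a quick standalone check (a sketch, not project data) of the two digit-split expressions used for CAMEO_INTL_2015:
# Worked check of the digit-split trick on a few sample codes.
for r in ['11', '25', '43', '55']:
    units = int(r) % 5
    units = 5 if units == 0 else units   # remap remainder 0 to 5
    tens = int(r) // 10
    print(r, '-> % 5 (0->5) gives', units, '; // 10 gives', tens)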
# How much data is missing in each row of the dataset now?
row_with_missing = pd.DataFrame(data={'row_idx':azdias_encoded.index, 'count_missing': azdias_encoded.isnull().sum(axis=1)})
#how many rows with missing value
num_miss_row = azdias_encoded.index[azdias_encoded.isnull().any(axis=1)]
print('There are {} rows with missing values, totaling {} missing entries.'.format(len(num_miss_row), np.sum(row_with_missing['count_missing'])))
# If there are other re-engineering tasks you need to perform, make sure you
# take care of them here. (Dealing with missing data will come in step 2.1.)
# Do whatever you need to in order to ensure that the dataframe only contains
# the columns that should be passed to the algorithm functions.
#visualize the location of missingness for the new dataset
fig = plt.subplots(figsize=(15,10))
sns.heatmap(azdias_encoded.isnull())
def clean_data(df, n_for_rows):
    """
    Perform feature trimming, re-encoding, and engineering for demographics data.
    INPUT: df - demographics DataFrame; n_for_rows - row-missingness threshold
    OUTPUT: trimmed and cleaned demographics DataFrame
    """
    # convert missing or unknown value codes into NaNs
    feat_info = pd.read_csv("AZDIAS_Feature_Summary.csv", sep=';')
    for number in range(0, len(feat_info)):
        x = feat_info.missing_or_unknown[number]  # string expression of the list
        x = x.replace(" ", "")                    # remove blanks
        l = x[1:-1].split(',')                    # parse the elements
        for i in l:
            if i.lstrip('-').isdigit():           # numeric code (possibly negative): replace its int form
                df.iloc[:, number].replace(int(i), np.nan, inplace=True)
            else:                                 # non-numeric codes such as 'X' or 'XX'
                df.iloc[:, number].replace(i, np.nan, inplace=True)
    # remove the columns identified above as missingness outliers
    outlier_vars = ['AGER_TYP', 'GEBURTSJAHR', 'TITEL_KZ', 'ALTER_HH', 'KK_KUNDENTYP', 'KBA05_BAUMAX']
    df = pd.DataFrame(df.drop(outlier_vars, axis=1))
    # remove rows with n_for_rows or more missing values
    row_with_missing = pd.DataFrame(data={'row_idx': df.index, 'count_missing': df.isnull().sum(axis=1)})
    df = pd.DataFrame(df[row_with_missing['count_missing'] < n_for_rows])
    # drop categorical features with more than 10 levels, plus GEBAEUDETYP (see above)
    varstodrop = ['GFK_URLAUBERTYP', 'LP_FAMILIE_FEIN', 'CAMEO_DEU_2015', 'GEBAEUDETYP']
    df = pd.DataFrame(df.drop(varstodrop, axis=1))
    # re-encode the levels in ANREDE_KZ, SOHO_KZ and OST_WEST_KZ into [0,1]
    df['ANREDE_KZ'] = df['ANREDE_KZ'].map({1: 0, 2: 1})
    df['SOHO_KZ'] = df['SOHO_KZ'].map({0.0: 0, 1.0: 1})
    df['OST_WEST_KZ'] = df['OST_WEST_KZ'].map({'W': 0, 'O': 1})
    # one-hot encode the categorical variables kept in the analysis
    vars_to_encode = ['CJT_GESAMTTYP', 'FINANZTYP', 'LP_FAMILIE_GROB', 'LP_STATUS_FEIN',
                      'LP_STATUS_GROB', 'NATIONALITAET_KZ', 'SHOPPER_TYP', 'VERS_TYP',
                      'ZABEOTYP', 'CAMEO_DEUG_2015', 'OST_WEST_KZ', 'ANREDE_KZ',
                      'GREEN_AVANTGARDE', 'SOHO_KZ']
    df = pd.DataFrame(pd.get_dummies(df, prefix=vars_to_encode, columns=vars_to_encode))
    df.reset_index(drop=True, inplace=True)
    # mixed variables: PRAEGENDE_JUGENDJAHRE -> decade and movement
    df['PJ_decade'] = df['PRAEGENDE_JUGENDJAHRE']
    df['PJ_movement'] = df['PRAEGENDE_JUGENDJAHRE']
    x = {1.0: 40, 2.0: 40, 3.0: 50, 4.0: 50, 5.0: 60, 6.0: 60, 7.0: 60, 8.0: 70, 9.0: 70,
         10.0: 80, 11.0: 80, 12.0: 80, 13.0: 80, 14.0: 90, 15.0: 90}
    # movement per Data_Dictionary.md (note: 12 = FDJ / communist party youth organisation is Mainstream)
    y = {1.0: 'Mainstream', 2.0: 'Avantgarde', 3.0: 'Mainstream', 4.0: 'Avantgarde',
         5.0: 'Mainstream', 6.0: 'Avantgarde', 7.0: 'Avantgarde', 8.0: 'Mainstream', 9.0: 'Avantgarde',
         10.0: 'Mainstream', 11.0: 'Avantgarde', 12.0: 'Mainstream', 13.0: 'Avantgarde',
         14.0: 'Mainstream', 15.0: 'Avantgarde'}
    df['PJ_decade'] = df['PJ_decade'].map(x)
    df['PJ_movement'] = df['PJ_movement'].map(y)
    df['PJ_movement'] = df['PJ_movement'].map({'Mainstream': 0, 'Avantgarde': 1})
    # mixed variables: CAMEO_INTL_2015 -> two digit-based columns (see the cells above)
    df['CAMEO_wealth'] = df['CAMEO_INTL_2015']
    df['CAMEO_life'] = df['CAMEO_INTL_2015']
    not_nan = df['CAMEO_INTL_2015'][df['CAMEO_INTL_2015'].notnull()]
    df.loc[df['CAMEO_INTL_2015'].notnull(), 'CAMEO_wealth'] = [int(r) % 5 for r in not_nan]
    df.loc[df['CAMEO_wealth'] == 0, 'CAMEO_wealth'] = 5
    df.loc[df['CAMEO_INTL_2015'].notnull(), 'CAMEO_life'] = [int(int(r) / 10) for r in not_nan]
    # drop the original mixed columns plus WOHNLAGE, PLZ8_BAUMAX and the two LP_LEBENSPHASE features
    drop_vars = ['PRAEGENDE_JUGENDJAHRE', 'WOHNLAGE', 'PLZ8_BAUMAX', 'CAMEO_INTL_2015',
                 'LP_LEBENSPHASE_FEIN', 'LP_LEBENSPHASE_GROB']
    df_new = pd.DataFrame(df.drop(drop_vars, axis=1))
    # return the cleaned dataframe
    return df_new
azdias = pd.read_csv("Udacity_AZDIAS_Subset.csv", sep=';')
azdias_encoded = clean_data(azdias, 10)
records_na = azdias_encoded.index[azdias_encoded.isnull().any(axis=1)]
print('There are {} rows out of {} with missing values ({:.2%}).'.format(len(records_na), len(azdias_encoded), len(records_na)/len(azdias_encoded)))
print('Since the amount of rows with missing values is not very large, I decide to remove these rows.')
# If you've not yet cleaned the dataset of all NaN values, then investigate and
# do that now.
azdias_no_na = pd.DataFrame(azdias_encoded.dropna())
azdias_no_na.reset_index(drop=True, inplace= True)
# Apply feature scaling to the general population demographics data.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
azdias_no_na = pd.DataFrame(scaler.fit_transform(azdias_no_na))
azdias_no_na.columns = azdias_encoded.columns
azdias_no_na.head()
There are 73898 rows out of 697109 with missing values (10.60%). Since the share of rows with missing values is not very large, I decide to simply remove these rows.
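As a hedged alternative sketch (not used in the analysis below), the remaining NaNs could instead be imputed before scaling, which keeps all rows. SimpleImputer is assumed available (scikit-learn >= 0.20; older versions expose sklearn.preprocessing.Imputer instead).
# Alternative to dropping rows: impute remaining NaNs with the per-column mode.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')  # mode suits ordinal/dummy columns
azdias_imputed = pd.DataFrame(imputer.fit_transform(azdias_encoded),
                              columns=azdias_encoded.columns)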
# Apply PCA to the data.
from sklearn.decomposition import PCA
pca = PCA()
X_pca = pca.fit_transform(azdias_no_na)
#check the dimensionality of pca
pca.components_.shape
# Investigate the variance accounted for by each principal component.
def scree_plot(pca):  # referenced from helper_functions.py for Interpret_PC_Results_SC
    '''
    Creates a scree plot associated with the principal components
    INPUT: pca - the result of instantiation of PCA in scikit-learn
    OUTPUT: None
    '''
    num_components = len(pca.explained_variance_ratio_)
    ind = np.arange(num_components)
    vals = pca.explained_variance_ratio_
    plt.figure(figsize=(25, 10))
    ax = plt.subplot(111)
    cumvals = np.cumsum(vals)
    ax.bar(ind, vals)
    ax.plot(ind, cumvals)
    for i in range(num_components):
        ax.annotate(r"%s%%" % ((str(vals[i]*100)[:4])), (ind[i]+0.2, vals[i]), va="bottom", ha="center", fontsize=12)
    ax.xaxis.set_tick_params(width=0)
    ax.yaxis.set_tick_params(width=2, length=12)
    ax.set_xlabel("Principal Component")
    ax.set_ylabel("Variance Explained (%)")
    plt.title('Explained Variance Per Principal Component')
scree_plot(pca)
#list the explained-variance ratio of each component
ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1)
#check how many components are needed to explain at least 85% of the variability in the original dataset
for i in range(1, len(ratios) + 1):
    sum_variance = ratios[0:i].sum()
    if sum_variance > 0.85:
        num_comps = i
        break
print("Using {} components, we can explain {:.2%} of the variability in the original data.".format(num_comps, sum_variance))
# Re-apply PCA to the data while selecting for number of components to retain.
#Re-apply PCA to select the first 51 components
pca = PCA(num_comps)
X_pca = pca.fit_transform(azdias_no_na)
The intention of feature reduction with PCA is to reduce dimensionality while retaining as much information as we can, so we look for the point where additional components stop adding much explained variability. I therefore decide to retain at least 85% of the variability in the original dataset, since beyond 85% additional components contribute little. This keeps 51 principal components.
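Note that scikit-learn can also pick the component count from a variance target directly: passing a float in (0, 1) as n_components keeps just enough components to reach that fraction. A sketch, assuming the same scaled data, that should agree with the manual loop above:
# Let PCA choose the number of components for >= 85% explained variance.
pca_85 = PCA(n_components=0.85)
X_pca_85 = pca_85.fit_transform(azdias_no_na)
print(pca_85.n_components_, pca_85.explained_variance_ratio_.sum())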
def pca_plot(df, pca, i_th_comp):
    '''
    Create a DataFrame of the PCA results
    Includes dimension feature weights and explained variance
    Visualizes the PCA results
    Note: i_th_comp is a zero-based index into pca.components_
    '''
    components_ = pca.components_[i_th_comp, :]
    explained_variance_ratio_ = pca.explained_variance_ratio_[i_th_comp]
    # PCA components
    components = pd.DataFrame({'weights': np.round(components_, 4)})
    components.index = list(df.columns)
    components.sort_values(by='weights', ascending=True, inplace=True)
    components = pd.DataFrame(components.transpose())
    components.reset_index(drop=True, inplace=True)
    display(components)
    # Create a bar plot visualization
    fig = plt.subplots(figsize=(60, 8))
    features = list(components.columns)
    y_pos = np.arange(len(features))
    weights = list(components.iloc[0])
    plt.bar(y_pos, weights, align='center', alpha=0.5)
    plt.xticks(y_pos, features, rotation=90)
    plt.ylabel('Weights')
    plt.show()
# Map weights for the first principal component to corresponding feature names
# and then print the linked values, sorted by weight.
# HINT: Try defining a function here or in a new cell that you can reuse in the
# other cells.
pca_plot(azdias_no_na, pca, 1)
# Map weights for the second principal component to corresponding feature names
# and then print the linked values, sorted by weight.
pca_plot(azdias_no_na, pca, 2)
# Map weights for the third principal component to corresponding feature names
# and then print the linked values, sorted by weight.
pca_plot(azdias_no_na, pca, 3)
1st PC: Among the negative weights, 'PJ_decade' and 'SEMIO_REL' both have large weights of the same sign, so increases in one are expected to go with increases in the other. Among the positive weights, 'ZABEOTYP_3' and 'ALTERSKATEGORIE_GROB' likewise share a sign and tend to move together. The sign gives the direction of the correlation between a feature and the PC, and the magnitude gives the feature's contribution to it, so these two groups contribute strongly to this PC in opposite directions. By contrast, features with opposite signs can be expected to show a negative correlation: increases in one should come with decreases in the other.
2nd PC: 'ANREDE_KZ_1' and 'SEMIO_KAEM' both have large negative weights, so increases in one are expected to go with increases in the other; 'SEMIO_VERT' and 'ANREDE_KZ_0' both have large positive weights and likewise tend to move together. As above, the two groups contribute strongly to this PC in opposite directions, and features with opposite signs can be expected to show a negative correlation.
3rd PC: 'GREEN_AVANTGARDE_0' and 'LP_STATUS_GROB_4.0' both have large negative weights, so increases in one are expected to go with increases in the other; 'PJ_movement' and 'GREEN_AVANTGARDE_1' both have large positive weights and likewise tend to move together. Again, the two groups contribute strongly to this PC in opposite directions, and features with opposite signs can be expected to show a negative correlation.
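These pairwise readings can also be pulled out as a small table; the helper below is a sketch over the fitted pca and the scaled frame, using the same zero-based indexing as pca_plot, and top_weights is a name introduced here for illustration.
# List the k strongest positive and negative weights of one component.
def top_weights(df, pca, i_th_comp, k=5):
    w = pd.Series(pca.components_[i_th_comp], index=df.columns)
    return pd.concat([w.nlargest(k), w.nsmallest(k)])

top_weights(azdias_no_na, pca, 1)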
# Over a number of different cluster counts...
# run k-means clustering on the data and...
# compute the average within-cluster distances.
def get_kmeans_score(data, center):  # referenced from Changing K - Solution
    # instantiate kmeans
    kmeans = KMeans(n_clusters=center)
    # fit the model to the data
    model = kmeans.fit(data)
    # obtain a score related to the model fit
    score = np.abs(model.score(data))
    return score
# Investigate the change in within-cluster distance across number of clusters.
# HINT: Use matplotlib's plot function to visualize this relationship.
scores = []
centers = list(range(1,30))
for center in centers:
    scores.append(get_kmeans_score(X_pca, center))
plt.plot(centers, scores, linestyle='--', marker='o', color='b');
plt.xlabel('K');
plt.ylabel('SSE');
plt.title('SSE vs. K');
plt.plot(centers, np.gradient(scores))
plt.xlabel('K')
plt.ylabel('Slope of SSE')
plt.title('SSE Slope vs. K')
# Re-fit the k-means model with the selected number of clusters and obtain
# cluster predictions for the general population demographics data.
#instantiate kmeans
kmeans = KMeans(n_clusters=15)
#Then fit the model to data using the fit method
model = kmeans.fit(X_pca)
azdias_label = pd.DataFrame({"Cluster_ID": model.predict(X_pca)})
azdias_pca = pd.DataFrame(data = X_pca, columns = ['Dimension {}'.format(i) for i in range(1,len(pca.components_)+1)])
azdias_pca = pd.concat([azdias_pca, azdias_label], axis=1)
azdias_pca.head()
From the plot, we can see that the SSE decreases as the number of clusters increases. I then calculate the slope at each number of clusters; the slopes converge toward zero and stabilize around 15. Therefore, I use 15 clusters to segment the population.
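The same reading can be done numerically: the relative SSE improvement per added cluster shrinks toward zero as K grows. A rough sketch over the scores/centers lists computed above, where the ~1% cutoff is an assumption for illustration, not a rule:
# Relative SSE improvement per added cluster; list the first Ks below the cutoff.
rel_gain = -np.diff(scores) / np.array(scores[:-1])
elbow_candidates = [k for k, g in zip(centers[1:], rel_gain) if g < 0.01]
print(elbow_candidates[:3])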
# Load in the customer demographics data.
customers = pd.read_csv("Udacity_CUSTOMERS_Subset.csv", sep=';')
customers.shape
# Investigate patterns in the amount of missing data in each column.
col_with_missing = pd.DataFrame(data={'vars':customers.columns, 'count_missing': customers.isnull().sum(axis=0)})
col_with_missing.reset_index(drop =True, inplace=True)
ax = col_with_missing.plot.bar(x='vars', y='count_missing', rot=90, figsize=(25,5))
# How much data is missing in each row of the dataset now?
row_with_missing = pd.DataFrame(data={'row_idx':customers.index, 'count_missing': customers.isnull().sum(axis=1)})
#how many rows with missing value
num_miss_row = customers.index[customers.isnull().any(axis=1)]
print('There are {} rows with missing values, totaling {} missing entries.'.format(len(num_miss_row), np.sum(row_with_missing['count_missing'])))
#plot histogram of missingness in rows to choose a suitable threshold
plt.hist(row_with_missing['count_missing'])
# Apply preprocessing, feature transformation, and clustering from the general
# demographics onto the customer data, obtaining cluster predictions for the
# customer demographics data.
customers_cleaned = clean_data(customers, 5)
customers_no_na = pd.DataFrame(customers_cleaned.dropna())
customers_no_na.reset_index(drop=True, inplace= True)
# Apply feature scaling
from sklearn.preprocessing import StandardScaler
customers_no_na = pd.DataFrame(scaler.transform(customers_no_na))
customers_no_na.columns = customers_cleaned.columns
X_hat_pca = pca.transform(customers_no_na)
#K-Means
customer_label = pd.DataFrame({"Cluster_ID": model.predict(X_hat_pca)})
customers_pca = pd.DataFrame(data = X_hat_pca, columns = ['Dimension {}'.format(i) for i in range(1,len(pca.components_)+1)])
customers_pca = pd.concat([customers_pca, customer_label], axis=1)
customers_pca.head()
Tip: use the .inverse_transform() method of the PCA and StandardScaler objects to transform centroids back to the original data space and interpret the retrieved values directly.
# Compare the proportion of data in each cluster for the customer data to the
# proportion of data in each cluster for the general population.
df_azdias_scaled_pca = azdias_pca
df_customers_scaled_pca = customers_pca
customers_na_cnt = customers.shape[0] - customers_cleaned.shape[0]
azdias_na_cnt = azdias.shape[0] - azdias_encoded.shape[0]
n_clusters = 15
azdias_pca['label'] = 'General'
customers_pca['label'] = 'Customer'
pred_label = pd.concat([azdias_pca, customers_pca])
# some trivial calculations below
plot_data = pred_label.groupby(['label'])['Cluster_ID'].value_counts().astype(float).rename("Count")
plot_data = pd.DataFrame(plot_data)
plot_data.reset_index(inplace=True)
# assign the large missingness data to a new cluster
ls = [['Customer', n_clusters, customers_na_cnt], ['General', n_clusters, azdias_na_cnt]]
for label, cluster_id, cnt in ls:
    new_cluster = pd.Series({'label': label, 'Cluster_ID': cluster_id, 'Count': cnt})
    plot_data = plot_data.append(new_cluster, ignore_index=True)
# total number of counts in population and customer dataset, including the large-NaN data
total_cus = plot_data[plot_data.label == 'Customer'].Count.sum(axis = 0)
total_gen = plot_data[plot_data.label == 'General'].Count.sum(axis = 0)
# calculate the percentage distribution
plot_data['Percentage'] = np.where(plot_data['label']=='Customer', plot_data['Count']/total_cus*100, plot_data['Count']/total_gen*100)
fig = plt.figure(figsize = (15,8))
ax = sns.barplot(x='Cluster_ID', y="Percentage", hue='label', data=plot_data)
ax.set_xlabel("Cluster ID", fontsize = 15)
ax.set_ylabel("Percentage (%)", fontsize = 15)
#ax.set_title(column, fontsize = 20)
ax.grid()
plt.annotate("The {}th Cluster represents the large missingness data".format(n_clusters), xy=(0.35, 1.02), xycoords='axes fraction', fontsize = 15);
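To pick the clusters to interpret programmatically rather than by eye, the percentage table can be pivoted and differenced; a sketch over plot_data from the cell above, where gap is a name introduced here for illustration:
# Customer-minus-General percentage gap per cluster: the max is the most
# overrepresented cluster among customers, the min the most underrepresented.
gap = plot_data.pivot(index='Cluster_ID', columns='label', values='Percentage')
gap = gap.drop(index=n_clusters)   # exclude the synthetic large-missingness cluster
gap['delta'] = gap['Customer'] - gap['General']
print('over:', gap['delta'].idxmax(), ' under:', gap['delta'].idxmin())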
# What kinds of people are part of a cluster that is overrepresented in the
# customer data compared to the general population?
centroid_id = [11, 2]
cluster_center = kmeans.cluster_centers_[centroid_id, :]
result = pd.DataFrame(data=scaler.inverse_transform(pca.inverse_transform(cluster_center)),
                      columns=azdias_no_na.columns,
                      index=['Cluster 11 (overrepresented)', 'Cluster 2 (underrepresented)'])
# income-related features
income_col = ['HH_EINKOMMEN_SCORE','LP_STATUS_GROB_1.0','LP_STATUS_GROB_2.0',\
'LP_STATUS_GROB_3.0','LP_STATUS_GROB_4.0','LP_STATUS_GROB_5.0',\
'FINANZ_MINIMALIST', 'CAMEO_wealth']
# age-related features
age_col = ['ALTERSKATEGORIE_GROB', 'PJ_decade']
# gender-related features
gender_col = ['ANREDE_KZ_0','ANREDE_KZ_1']
# movement features
move_col = ['GREEN_AVANTGARDE_0', 'GREEN_AVANTGARDE_1', 'PJ_movement']
# building location
location_col = ['OST_WEST_KZ_1', 'OST_WEST_KZ_0']
features = income_col + age_col + gender_col + move_col + location_col
income = ['Income' for i in np.arange(len(income_col))]
age = ['Age' for i in np.arange(len(age_col))]
gender = ['Gender' for i in np.arange(len(gender_col))]
movement = ['Movement' for i in np.arange(len(move_col))]
location = ['Apartment Location' for i in np.arange(len(location_col))]
parents = income+age+gender+movement+location
meaning = ['Estimated household net income', 'low-income earners', 'average earners', \
'independents', 'houseowners', 'top earners', 'low financial interest', 'Wealth Scale',\
'Estimated age', 'Generation Decades', 'Male', 'Female', \
'mainstream', 'avantgarde', 'mainstream vs. avantgarde', 'East Germany', 'West Germany']
summary = pd.DataFrame(result[features].T.values, \
index=[parents, features, meaning], columns = result.index)
summary
From the table, we can see that the most overrepresented cluster (Cluster 11) is a group of people with higher income (HH_EINKOMMEN_SCORE: a lower scale value indicates higher income), born around 1968, involved in the green avantgarde, and living in west Germany, whereas people in the most underrepresented cluster (Cluster 2) are mostly female, lower-income earners, born around 1988, and not members of the green avantgarde; roughly the opposite of the people in the most overrepresented cluster.