from enum import IntEnum
import numpy as np
import scipy
from scipy import stats
import pandas as pd
class Features(IntEnum):
    """Column positions of the features in the arrays passed to ``_predict``.

    The members are used as integer column indices into the two-dimensional
    feature arrays handled by ``CreditDefaultData2._predict`` and
    ``CreditDefaultDataCategoricalFeature._predict``.

    ``ZIP_code`` is the index of the *first* one-hot encoded zip code column
    when the economic factor column is present; without an economic factor
    the zip code columns start one position earlier.
    """

    Age = 0
    Income = 1
    Savings = 2
    Credit_income_ratio = 3
    # Historical misspelling; kept because existing code refers to this name.
    Economimc_factor = 4
    # Correctly spelled alias of ``Economimc_factor`` (same value, same member).
    Economic_factor = 4
    ZIP_code = 5
[docs]
class CreditDefaultData:
    """Generator for synthetic credit default data with a one-hot encoded region feature."""

    @staticmethod
    def sample(n_data: int, seed: int = None, constant=-1.0,
               cov: np.ndarray = None) -> pd.DataFrame:
        r"""Sample credit default data.

        Return a pandas DataFrame that contains some credit features together with the default probability
        and an indicator if the default occurred (1) or if the credit did not default (0). The default
        probability (pd) of a credit is computed by a logistic model with fixed coefficients. The following
        features are used

        - :math:`x_{\mbox{age}}` age of lender, beta distributed (a=2.0, b=5.0), column ``age``
        - :math:`x_{\mbox{income}}` income of lender, beta distributed (a=2.0, b=2.0), column ``income``
        - :math:`x_{\mbox{savings}}` savings of lender, beta distributed (a=5.0, b=1.0), column ``savings``
        - :math:`x_{\mbox{amount}}` amount of credit, beta distributed (a=0.5, b=0.5), column ``credit_amount_perc``
        - :math:`x_{\mbox{region}}` one-hot encoded feature indicating one of three regions the lender lives in
          (columns ``region_0``, ``region_1``, ``region_2``). The regions are uniformly distributed.

        The single features (exception is the region that is drawn independently of the other features) are
        related via a Gaussian copula. The following figure shows the distributions and pairplots for a
        generated sample of features.

        .. image:: ../../../docs/source/figs/credit_default_features.png
            :align: center
            :width: 400

        After the features have been generated, the default probabilities (pd) are computed via the formula

        .. math::

            pd = \frac{1}{1+e^{2(x_0 + x_1 + x_2 + x_3)}}

        with

        .. math::

            x_0 = 5 w_{\mbox{region}} \left((0.5-x_{\mbox{age}})^2 - 0.3 x_{\mbox{amount}}\right), \quad
            x_1 = 1.5 x_{\mbox{income}}^2, \quad
            x_2 = 1.5 x_{\mbox{savings}}, \quad
            x_3 = \mbox{constant}

        where :math:`w_{\mbox{region}}` is 0.5, 1.1 or 0.7 for region 0, 1 or 2 respectively.

        Args:
            n_data (int): Number of data sampled (number of rows of final DataFrame).
            seed (int, optional): Seed passed to ``np.random.seed`` (this sets numpy's global random state).
                If None, no seed will be set. Defaults to None.
            constant (float, optional): Constant used in the logistic model that determines the overall level
                of the pd. It is stored in the column ``regime``. Defaults to -1.0.
            cov (np.ndarray, optional): 4x4 covariance matrix used in the Gaussian copula. Defaults to None
                (then unit variances and a flat covariance of 0.95 are used).

        Returns:
            pd.DataFrame: DataFrame with features, the column ``regime``, default probabilities
            (``default_prob``) and default indicator (``defaulted``).
        """
        if seed is not None:
            np.random.seed(seed)
        if cov is None:
            cov = np.array([[1.0, 0.95, 0.95, 0.95],
                            [0.95, 1.0, 0.95, 0.95],
                            [0.95, 0.95, 1.0, 0.95],
                            [0.95, 0.95, 0.95, 1.0]])
        mean = np.array([0.0, 0.0, 0.0, 0.0])
        columns = ['age', 'income', 'savings', 'credit_amount_perc']
        beta_params = {'age': {'a': 2.0, 'b': 5.0},
                       'income': {'a': 2.0, 'b': 2.0},
                       'savings': {'a': 5.0, 'b': 1.0},
                       'credit_amount_perc': {'a': 0.5, 'b': 0.5}}

        # Gaussian copula: correlated normals -> uniforms (normal cdf) -> beta marginals (beta ppf).
        normals = np.random.multivariate_normal(mean=mean, cov=cov, size=n_data)
        df = pd.DataFrame(normals, columns=columns)
        for c in columns:
            df[c] = scipy.stats.beta.ppf(scipy.stats.norm.cdf(df[c].values), **beta_params[c])

        # The region is drawn independently of the copula features and one-hot encoded.
        n_regions = 3
        region_index = np.random.randint(low=0, high=n_regions, size=n_data)
        for i in range(n_regions):
            df['region_' + str(i)] = (region_index == i).astype(float)

        df['regime'] = constant

        default_prob = CreditDefaultData._predict(df)
        df['default_prob'] = default_prob

        # A credit defaults if a uniform draw falls below its default probability.
        uniform_draws = np.random.uniform(low=0.0, high=1.0, size=n_data)
        df['defaulted'] = np.where(uniform_draws < np.asarray(default_prob), 1.0, 0.0)
        return df

    @staticmethod
    def _predict(df: pd.DataFrame) -> pd.Series:
        """This method computes the pd and is called by the method sample.

        Args:
            df (pd.DataFrame): Pandas DataFrame as constructed within the sample method. The columns
                ``age``, ``income``, ``savings``, ``credit_amount_perc``, ``region_0``, ``region_1``,
                ``region_2`` and ``regime`` are read.

        Returns:
            pd.Series: Default probabilities, one entry per row of ``df``.
        """
        # Weights of the three one-hot encoded regions.
        beta0 = [0.5, 1.1, 0.7]
        tmp = (0.5 - df['age'])**2 - 0.3*df['credit_amount_perc']
        region_weight = (beta0[0]*df['region_0'] + beta0[1]*df['region_1']
                         + beta0[2]*df['region_2'])
        x0 = 5.0*region_weight*tmp
        x1 = 1.5*(df['income'])**2
        x2 = 1.5*df['savings']
        x3 = df['regime']
        return 1.0/(1.0 + np.exp(2.0*(x0 + x1 + x2 + x3)))
class CreditDefaultData2:
    """Generator for synthetic multi-year credit default data with an optional economic factor."""

    @staticmethod
    def sample(n_years: int, n_data_per_year: int, seed: int = None,
               cov: np.ndarray = None,
               include_economic_factor: bool = True) -> pd.DataFrame:
        r"""Sample credit default data.

        Return a pandas DataFrame that contains some credit features together with the default probability
        and an indicator if the default occurred (1) or if the credit did not default (0). The default
        probability (pd) of a credit is computed by a logistic model with fixed coefficients. The following
        features are used

        - :math:`x_{\mbox{age}}` age of lender, beta distributed (a=2.0, b=5.0), column ``age``
        - :math:`x_{\mbox{income}}` income of lender, beta distributed (a=2.0, b=2.0), column ``income``
        - :math:`x_{\mbox{savings}}` savings of lender, beta distributed (a=5.0, b=1.0), column ``savings``
        - :math:`x_{\mbox{ratio}}` credit income ratio, beta distributed (a=0.5, b=0.5), column ``credit_income_ratio``
        - :math:`x_{\mbox{economic}}` economic factor, uniformly sampled from [0,0.5] for each year but the
          value of the last year is set to 1.0, column ``economic_factor`` (only if ``include_economic_factor``)

        The single features (except the economic factor, which is constant within a year) are related via a
        Gaussian copula. The following figure shows the distributions and pairplots for a
        generated sample of features.

        .. image:: ../../../docs/source/figs/credit_default_features.png
            :align: center
            :width: 400

        After the features have been generated, the default probabilities (pd) are computed via the formula

        .. math::

            pd = \frac{1}{1+e^{2\left(5 x_{\mbox{age}}(1-x_{\mbox{age}}) - 1.5 x_{\mbox{ratio}}
            + 1.5 x_{\mbox{income}}^2 + 1.5 x_{\mbox{savings}} + (1 - x_{\mbox{economic}})\right)}}

        where the last term is dropped if no economic factor is included.

        Args:
            n_years (int): Number of years. The rows of each year form one contiguous chunk of the result.
            n_data_per_year (int): Number of data sampled per year. The final DataFrame has
                ``n_years*n_data_per_year`` rows.
            seed (int, optional): Seed passed to ``np.random.seed`` (this sets numpy's global random state).
                If None, no seed will be set. Defaults to None.
            cov (np.ndarray, optional): 4x4 covariance matrix used in the Gaussian copula. Defaults to None
                (then unit variances and a flat covariance of 0.95 are used).
            include_economic_factor (bool, optional): If True, the column ``economic_factor`` is added and
                enters the pd. Defaults to True.

        Returns:
            pd.DataFrame: DataFrame with features, default probabilities (``default_prob``) and default
            indicator (``defaulted``).
        """
        if seed is not None:
            np.random.seed(seed)
        if cov is None:
            cov = np.array([[1.0, 0.95, 0.95, 0.95],
                            [0.95, 1.0, 0.95, 0.95],
                            [0.95, 0.95, 1.0, 0.95],
                            [0.95, 0.95, 0.95, 1.0]])
        mean = np.array([0.0, 0.0, 0.0, 0.0])
        copula_columns = ['age', 'income', 'savings', 'credit_income_ratio']
        beta_params = {'age': {'a': 2.0, 'b': 5.0},
                       'income': {'a': 2.0, 'b': 2.0},
                       'savings': {'a': 5.0, 'b': 1.0},
                       'credit_income_ratio': {'a': 0.5, 'b': 0.5}}

        columns = list(copula_columns)
        if include_economic_factor:
            economic_score = np.random.uniform(0, 0.5, size=n_years)
            if n_years > 0:  # nothing to overwrite for an empty sample
                economic_score[-1] = 1.0
            columns.append('economic_factor')
        economic_column = len(copula_columns)

        # Fill a plain numpy array and build the DataFrame once at the end. Writing through
        # ``frame[col][start:end] = ...`` is chained assignment, which does not modify the
        # frame when pandas Copy-on-Write is active.
        n_total = n_years*n_data_per_year
        data = np.empty((n_total, len(columns)))
        for y in range(n_years):
            start = y*n_data_per_year
            end = start + n_data_per_year
            if include_economic_factor:
                data[start:end, economic_column] = economic_score[y]
            # Gaussian copula: correlated normals -> uniforms (normal cdf) -> beta marginals (beta ppf).
            normals = np.random.multivariate_normal(mean=mean, cov=cov, size=n_data_per_year)
            uniforms = scipy.stats.norm.cdf(normals)
            for j, c in enumerate(copula_columns):
                data[start:end, j] = scipy.stats.beta.ppf(uniforms[:, j], **beta_params[c])
        df = pd.DataFrame(data, columns=columns)

        default_prob = CreditDefaultData2._predict(df.values)
        df['default_prob'] = default_prob

        # A credit defaults if a uniform draw falls below its default probability.
        uniform_draws = np.random.uniform(low=0.0, high=1.0, size=n_total)
        df['defaulted'] = np.where(uniform_draws < default_prob, 1.0, 0.0)
        return df

    @staticmethod
    def _predict(X: np.ndarray) -> np.ndarray:
        """This method computes the pd and is called by the method sample.

        Args:
            X (np.ndarray): Two-dimensional array of features whose columns are indexed by ``Features``
                (age, income, savings, credit income ratio and, if the array has exactly five columns,
                the economic factor).

        Returns:
            np.ndarray: Vector of default probabilities.
        """
        X = np.asarray(X, dtype=float)
        age = X[:, Features.Age]
        age_term = 5.0*(1.0 - age)*age
        credit_income_ratio = -5.0*0.3*X[:, Features.Credit_income_ratio]
        x1 = 1.5*(X[:, Features.Income])**2
        x2 = 1.5*X[:, Features.Savings]
        if X.shape[1] == 5:
            x3 = 1.0 - X[:, Features.Economimc_factor]
        else:
            # No economic factor column: the term vanishes, which is the same as an economic factor of 1.0.
            x3 = 0.0
        return 1.0/(1.0 + np.exp(2.0*(age_term + credit_income_ratio + x1 + x2 + x3)))
class CreditDefaultDataCategoricalFeature:
    """Generator for synthetic multi-year credit default data with a categorical zip code feature."""

    # Score of each of the ten zip codes; the score of a lender's zip code is added to the
    # exponent of the pd formula in ``_predict``.
    zip_code_score = [0.05, 0.13, 0.275, 0.59, 0.42, 0.05, 0.87, 0.05, 0.21, 0.69]

    @staticmethod
    def sample(n_years: int, n_data_per_year: int, seed: int = None,
               cov: np.ndarray = None, include_economic_factor: bool = True) -> pd.DataFrame:
        r"""Sample credit default data.

        Return a pandas DataFrame that contains some credit features together with the default probability
        and an indicator if the default occurred (1) or if the credit did not default (0). The default
        probability (pd) of a credit is computed by a logistic model with fixed coefficients. The following
        features are used

        - :math:`x_{\mbox{age}}` age of lender, beta distributed (a=2.0, b=5.0), column ``age``
        - :math:`x_{\mbox{income}}` income of lender, beta distributed (a=2.0, b=2.0), column ``income``
        - :math:`x_{\mbox{savings}}` savings of lender, beta distributed (a=5.0, b=1.0), column ``savings``
        - :math:`x_{\mbox{ratio}}` credit income ratio, beta distributed (a=0.5, b=0.5), column ``credit_income_ratio``
        - :math:`x_{\mbox{economic}}` economic factor, uniformly sampled from [0,0.5] for each year but the
          value of the last year is set to 1.0, column ``economic_factor`` (only if ``include_economic_factor``)
        - :math:`x_{\mbox{zip}}` zip code of lender, uniformly sampled from ten categories and one-hot encoded
          (columns ``zip_code_0`` to ``zip_code_9``). Each zip code has a fixed score (see ``zip_code_score``)
          that enters the pd.

        The single features (except the zip code and the economic factor) are related via a Gaussian copula.
        The following figure shows the distributions and pairplots for a
        generated sample of features.

        .. image:: ../../../docs/source/figs/credit_default_features.png
            :align: center
            :width: 400

        After the features have been generated, the default probabilities (pd) are computed via the formula

        .. math::

            pd = \frac{1}{1+e^{2\left(5 x_{\mbox{age}}(1-x_{\mbox{age}}) - 1.5 x_{\mbox{ratio}}
            + 1.5 x_{\mbox{income}}^2 + 1.5 x_{\mbox{savings}} + (1 - x_{\mbox{economic}})
            + s_{\mbox{zip}}\right)}}

        where :math:`s_{\mbox{zip}}` is the score of the lender's zip code and the economic term is dropped
        if no economic factor is included.

        Args:
            n_years (int): Number of years. The rows of each year form one contiguous chunk of the result.
            n_data_per_year (int): Number of data sampled per year. The final DataFrame has
                ``n_years*n_data_per_year`` rows.
            seed (int, optional): Seed passed to ``np.random.seed`` (this sets numpy's global random state).
                If None, no seed will be set. Defaults to None.
            cov (np.ndarray, optional): 4x4 covariance matrix used in the Gaussian copula. Defaults to None
                (then unit variances and a flat covariance of 0.95 are used).
            include_economic_factor (bool, optional): If True, the column ``economic_factor`` is added and
                enters the pd. Defaults to True.

        Returns:
            pd.DataFrame: DataFrame with features, default probabilities (``default_prob``) and default
            indicator (``defaulted``).
        """
        if seed is not None:
            np.random.seed(seed)
        if cov is None:
            cov = np.array([[1.0, 0.95, 0.95, 0.95],
                            [0.95, 1.0, 0.95, 0.95],
                            [0.95, 0.95, 1.0, 0.95],
                            [0.95, 0.95, 0.95, 1.0]])
        mean = np.array([0.0, 0.0, 0.0, 0.0])
        copula_columns = ['age', 'income', 'savings', 'credit_income_ratio']
        beta_params = {'age': {'a': 2.0, 'b': 5.0},
                       'income': {'a': 2.0, 'b': 2.0},
                       'savings': {'a': 5.0, 'b': 1.0},
                       'credit_income_ratio': {'a': 0.5, 'b': 0.5}}

        columns = list(copula_columns)
        if include_economic_factor:
            economic_score = np.random.uniform(0, 0.5, size=n_years)
            if n_years > 0:  # nothing to overwrite for an empty sample
                economic_score[-1] = 1.0
            columns.append('economic_factor')
        economic_column = len(copula_columns)

        # Fill a plain numpy array and build the DataFrame once at the end. Writing through
        # ``frame[col][start:end] = ...`` is chained assignment, which does not modify the
        # frame when pandas Copy-on-Write is active.
        n_total = n_years*n_data_per_year
        data = np.empty((n_total, len(columns)))
        for y in range(n_years):
            start = y*n_data_per_year
            end = start + n_data_per_year
            if include_economic_factor:
                data[start:end, economic_column] = economic_score[y]
            # Gaussian copula: correlated normals -> uniforms (normal cdf) -> beta marginals (beta ppf).
            normals = np.random.multivariate_normal(mean=mean, cov=cov, size=n_data_per_year)
            uniforms = scipy.stats.norm.cdf(normals)
            for j, c in enumerate(copula_columns):
                data[start:end, j] = scipy.stats.beta.ppf(uniforms[:, j], **beta_params[c])
        df = pd.DataFrame(data, columns=columns)

        # One-hot encode the zip code with a fixed number of float columns. This guarantees that
        # a column exists for every zip code even if it was not drawn, and that ``df.values``
        # stays a float array, both of which ``_predict`` relies on.
        n_zip = len(CreditDefaultDataCategoricalFeature.zip_code_score)
        zip_code_values = np.random.randint(0, n_zip, size=n_total)
        one_hot = np.eye(n_zip)[zip_code_values]
        zip_columns = ['zip_code_' + str(i) for i in range(n_zip)]
        df = pd.concat([df, pd.DataFrame(one_hot, columns=zip_columns)], axis=1)

        default_prob = CreditDefaultDataCategoricalFeature._predict(df.values)
        df['default_prob'] = default_prob

        # A credit defaults if a uniform draw falls below its default probability.
        uniform_draws = np.random.uniform(low=0.0, high=1.0, size=n_total)
        df['defaulted'] = np.where(uniform_draws < default_prob, 1.0, 0.0)
        return df

    @staticmethod
    def _predict(X: np.ndarray) -> np.ndarray:
        """This method computes the pd and is called by the method sample.

        Args:
            X (np.ndarray): Two-dimensional array of features whose leading columns are indexed by
                ``Features`` (age, income, savings, credit income ratio, optionally the economic factor),
                followed by one one-hot column per entry of ``zip_code_score``.

        Returns:
            np.ndarray: Vector of default probabilities.
        """
        X = np.asarray(X, dtype=float)
        scores = np.asarray(CreditDefaultDataCategoricalFeature.zip_code_score)
        n_zip = scores.shape[0]

        age = X[:, Features.Age]
        age_term = 5.0*(1.0 - age)*age
        credit_income_ratio = -5.0*0.3*X[:, Features.Credit_income_ratio]
        x1 = 1.5*(X[:, Features.Income])**2
        x2 = 1.5*X[:, Features.Savings]

        # The economic factor is present if there is room for it in front of the zip code columns.
        if X.shape[1] >= Features.ZIP_code + n_zip:
            x3 = 1.0 - X[:, Features.Economimc_factor]
            zip_start = int(Features.ZIP_code)
        else:
            # No economic factor column: the term vanishes and the zip code columns start one earlier.
            x3 = 0.0
            zip_start = int(Features.ZIP_code) - 1

        # Score of the zip code selected by the one-hot columns.
        x4 = X[:, zip_start:zip_start + n_zip].dot(scores)
        return 1.0/(1.0 + np.exp(2.0*(age_term + credit_income_ratio + x1 + x2 + x3 + x4)))