# Source code for rivapy.sample_data.market_data.credit_default

from enum import IntEnum
import numpy as np
import scipy
from scipy import stats
import pandas as pd

class Features(IntEnum):
    """Column positions of the features in the arrays passed to ``_predict``.

    The members are used as integer column indices into the 2D feature array.
    ``ZIP_code`` is the position of the first one-hot zip code column when the
    economic factor column is present.
    """

    Age = 0
    Income = 1
    Savings = 2
    Credit_income_ratio = 3
    # Historical, misspelled name. It stays the canonical member so that
    # existing callers and ``Features(4).name`` keep working unchanged.
    Economimc_factor = 4
    # Correctly spelled alias of ``Economimc_factor`` (same value, same member).
    Economic_factor = 4
    ZIP_code = 5


class CreditDefaultData:
    """Generator for synthetic credit default data with a one-hot encoded region."""

    # Shape parameters (a, b) of the beta marginals, keyed by feature column.
    # The insertion order defines the column order of the sampled features.
    _BETA_PARAMS = {
        'age': {'a': 2.0, 'b': 5.0},
        'income': {'a': 2.0, 'b': 2.0},
        'savings': {'a': 5.0, 'b': 1.0},
        'credit_amount_perc': {'a': 0.5, 'b': 0.5},
    }
    # Weights of the columns region_0, region_1 and region_2 in the logit.
    _REGION_WEIGHTS = (0.5, 1.1, 0.7)

    @staticmethod
    def sample(n_data: int, seed: int = None, constant=-1.0,
               cov: np.ndarray = None) -> pd.DataFrame:
        r"""Sample credit default data.

        Return a pandas DataFrame that contains some credit features together with the default probability
        and an indicator if the default occurred (1) or if the credit did not default (0). The default
        probability (pd) of a credit is computed by a logistic function with fixed coefficients.
        The following features are used

        - :math:`x_{\mbox{age}}` age of borrower, sampled from beta distribution (a=2.0, b=5.0)
        - :math:`x_{\mbox{income}}` income of borrower, sampled from beta distribution (a=2.0, b=2.0)
        - :math:`x_{\mbox{savings}}` savings of borrower, sampled from beta distribution (a=5.0, b=1.0)
        - :math:`x_{\mbox{amount}}` amount of credit, sampled from beta distribution (a=0.5, b=0.5)
        - :math:`x_{\mbox{region}}` one hot encoded feature indicating one of three regions the borrower lives in.
          The regions are uniformly distributed.

        The single features (exception is the region that is drawn independently of the other features) are
        related via a Gaussian copula. The following figure shows the distributions and pairplots for a
        generated sample of features.

        .. image:: ../../../docs/source/figs/credit_default_features.png
            :align: center
            :width: 400

        After the features have been generated, the default probabilities (pd) are computed via the formula

        .. math::
            pd = \frac{1}{1+e^{2(x_0 + 1.5 x_{\mbox{income}}^2 + 1.5 x_{\mbox{savings}} + c)}}, \quad
            x_0 = 5 w_{\mbox{region}} \left((0.5 - x_{\mbox{age}})^2 - 0.3 x_{\mbox{amount}}\right)

        where :math:`c` is the given constant and :math:`w_{\mbox{region}}` equals 0.5, 1.1 or 0.7
        for region 0, 1 or 2 respectively.

        Args:
            n_data (int): Number of data sampled (number of rows of final DataFrame).
            seed (int, optional): The seed passed to ``np.random.seed``, if None, no seed will be set. Defaults to None.
            constant (float, optional): Constant used in the logit that determines the overall level of the pd.
                It is stored in the column ``regime``. Defaults to -1.0.
            cov (np.ndarray, optional): 4x4 covariance matrix used in the Gaussian copula (feature order: age, income,
                savings, credit amount). Defaults to None (then unit variances and a flat covariance of 0.95 are used).

        Returns:
            pd.DataFrame: DataFrame with the columns ``age``, ``income``, ``savings``, ``credit_amount_perc``,
            ``region_0``, ``region_1``, ``region_2``, ``regime``, ``default_prob`` and ``defaulted``.
        """
        if seed is not None:
            np.random.seed(seed)
        if cov is None:
            cov = np.array([[1.0, 0.95, 0.95, 0.95],
                            [0.95, 1.0, 0.95, 0.95],
                            [0.95, 0.95, 1.0, 0.95],
                            [0.95, 0.95, 0.95, 1.0]])
        names = list(CreditDefaultData._BETA_PARAMS)

        # Gaussian copula: correlated normals -> uniforms -> beta marginals.
        normals = np.random.multivariate_normal(mean=np.zeros(len(names)), cov=cov, size=n_data)
        uniforms = stats.norm.cdf(normals)
        data = {
            name: stats.beta.ppf(uniforms[:, j], **CreditDefaultData._BETA_PARAMS[name])
            for j, name in enumerate(names)
        }

        # The region is drawn independently of the copula features and one-hot encoded.
        n_regions = len(CreditDefaultData._REGION_WEIGHTS)
        region_idx = np.random.randint(low=0, high=n_regions, size=n_data)
        for i in range(n_regions):
            data['region_' + str(i)] = (region_idx == i).astype(float)

        df = pd.DataFrame(data)
        df['regime'] = constant

        default_prob = CreditDefaultData._predict(df)
        df['default_prob'] = default_prob
        # A credit defaults if a uniform draw falls below its default probability.
        draws = np.random.uniform(low=0.0, high=1.0, size=n_data)
        df['defaulted'] = (draws < np.asarray(default_prob)).astype(float)
        return df

    @staticmethod
    def _predict(df: pd.DataFrame) -> np.ndarray:
        """This method computes the pd and is called by the method sample.

        Args:
            df (pd.DataFrame): Pandas DataFrame as constructed within the sample method. It must contain the
                columns ``age``, ``income``, ``savings``, ``credit_amount_perc``, ``region_0``, ``region_1``,
                ``region_2`` and ``regime``.

        Returns:
            np.ndarray: Vector of default probabilities (a pandas Series aligned with ``df`` when the
            columns of ``df`` are Series).
        """
        region_weight = sum(
            weight * df['region_' + str(i)]
            for i, weight in enumerate(CreditDefaultData._REGION_WEIGHTS)
        )
        tmp = (0.5 - df['age'])**2 - 0.3 * df['credit_amount_perc']
        x0 = 5.0 * region_weight * tmp
        x1 = 1.5 * (df['income'])**2
        x2 = 1.5 * df['savings']
        x3 = df['regime']
        return 1.0 / (1.0 + np.exp(2.0 * (x0 + x1 + x2 + x3)))

    

class CreditDefaultData2:
    """Generator for synthetic multi-year credit default data with an optional economic factor."""

    # Shape parameters (a, b) of the beta marginals, keyed by feature column.
    # The insertion order defines the column order and matches the positions
    # Age, Income, Savings and Credit_income_ratio of the Features enum.
    _BETA_PARAMS = {
        'age': {'a': 2.0, 'b': 5.0},
        'income': {'a': 2.0, 'b': 2.0},
        'savings': {'a': 5.0, 'b': 1.0},
        'credit_income_ratio': {'a': 0.5, 'b': 0.5},
    }

    @staticmethod
    def sample(n_years: int, n_data_per_year: int, seed: int = None,
               cov: np.ndarray = None,
               include_economic_factor: bool = True) -> pd.DataFrame:
        r"""Sample credit default data.

        Return a pandas DataFrame that contains some credit features together with the default probability
        and an indicator if the default occurred (1) or if the credit did not default (0). The default
        probability (pd) of a credit is computed by a logistic function with fixed coefficients.
        The following features are used

        - :math:`x_{\mbox{age}}` age of borrower, sampled from beta distribution (a=2.0, b=5.0)
        - :math:`x_{\mbox{income}}` income of borrower, sampled from beta distribution (a=2.0, b=2.0)
        - :math:`x_{\mbox{savings}}` savings of borrower, sampled from beta distribution (a=5.0, b=1.0)
        - :math:`x_{\mbox{ratio}}` credit income ratio, sampled from beta distribution (a=0.5, b=0.5)
        - :math:`x_{\mbox{economic}}` economic factor, uniformly sampled from [0,0.5] once per year, except for
          the last year where it is set to 1.0

        The single features (except the economic factor) are related via a Gaussian copula. The following figure
        shows the distributions and pairplots for a generated sample of features.

        .. image:: ../../../docs/source/figs/credit_default_features.png
            :align: center
            :width: 400

        After the features have been generated, the default probabilities (pd) are computed via the formula

        .. math::
            pd = \frac{1}{1+e^{2\left(5 x_{\mbox{age}}(1-x_{\mbox{age}}) - 1.5 x_{\mbox{ratio}}
                 + 1.5 x_{\mbox{income}}^2 + 1.5 x_{\mbox{savings}} + (1 - x_{\mbox{economic}})\right)}}

        where the last term is dropped if the economic factor is not included.

        Args:
            n_years (int): Number of years that are sampled.
            n_data_per_year (int): Number of data sampled per year (the final DataFrame has
                n_years*n_data_per_year rows, ordered by year).
            seed (int, optional): The seed passed to ``np.random.seed``, if None, no seed will be set. Defaults to None.
            cov (np.ndarray, optional): 4x4 covariance matrix used in the Gaussian copula. Defaults to None
                (then unit variances and a flat covariance of 0.95 are used).
            include_economic_factor (bool, optional): If True, the column ``economic_factor`` is added and
                enters the pd. Defaults to True.

        Returns:
            pd.DataFrame: DataFrame with features, default probabilities (``default_prob``) and default
            indicator (``defaulted``).
        """
        if seed is not None:
            np.random.seed(seed)
        if cov is None:
            cov = np.array([[1.0, 0.95, 0.95, 0.95],
                            [0.95, 1.0, 0.95, 0.95],
                            [0.95, 0.95, 1.0, 0.95],
                            [0.95, 0.95, 0.95, 1.0]])
        mean = np.array([0.0, 0.0, 0.0, 0.0])
        n_total = n_years * n_data_per_year
        names = list(CreditDefaultData2._BETA_PARAMS)
        columns = list(names)

        if include_economic_factor:
            columns.append('economic_factor')
            economic_score = np.random.uniform(0, 0.5, size=n_years)
            if n_years > 0:  # guard: indexing [-1] on an empty array would raise
                economic_score[-1] = 1.0

        # Fill a plain numpy buffer and build the DataFrame once at the end.
        # Chained assignment on a DataFrame (df[col][a:b] = ...) does not write
        # into the frame under pandas Copy-on-Write.
        values = np.empty((n_total, len(columns)))
        for y in range(n_years):
            rows = slice(y * n_data_per_year, (y + 1) * n_data_per_year)
            if include_economic_factor:
                values[rows, len(names)] = economic_score[y]
            # Gaussian copula: correlated normals -> uniforms -> beta marginals.
            normals = np.random.multivariate_normal(mean=mean, cov=cov, size=n_data_per_year)
            uniforms = stats.norm.cdf(normals)
            for j, name in enumerate(names):
                values[rows, j] = stats.beta.ppf(uniforms[:, j], **CreditDefaultData2._BETA_PARAMS[name])
        df = pd.DataFrame(values, columns=columns)

        default_prob = CreditDefaultData2._predict(df.values)
        df['default_prob'] = default_prob
        # A credit defaults if a uniform draw falls below its default probability.
        draws = np.random.uniform(low=0.0, high=1.0, size=n_total)
        df['defaulted'] = (draws < default_prob).astype(float)
        return df

    @staticmethod
    def _predict(X: np.ndarray) -> np.ndarray:
        """This method computes the pd and is called by the method sample.

        Args:
            X (np.ndarray): 2D array of features whose columns are ordered as in the Features enum
                (age, income, savings, credit income ratio and, optionally, economic factor).

        Returns:
            np.ndarray: Vector of default probabilities.
        """
        age = X[:, Features.Age]
        age = 5.0 * (1.0 - age) * age
        credit_income_ratio = -5.0 * 0.3 * X[:, Features.Credit_income_ratio]
        x1 = 1.5 * (X[:, Features.Income])**2
        x2 = 1.5 * X[:, Features.Savings]
        if X.shape[1] == 5:
            x3 = 1.0 - X[:, Features.Economimc_factor]
        else:
            # No economic factor column: equivalent to an economic factor of 1.0.
            x3 = 0.0
        return 1.0 / (1.0 + np.exp(2.0 * (age + credit_income_ratio + x1 + x2 + x3)))
    


class CreditDefaultDataCategoricalFeature:
    """Generator for synthetic multi-year credit default data with a categorical zip code feature."""

    # Score of each zip code (index = zip code) that enters the logit of the pd.
    zip_code_score = [0.05, 0.13, 0.275, 0.59, 0.42, 0.05, 0.87, 0.05, 0.21, 0.69]

    # Shape parameters (a, b) of the beta marginals, keyed by feature column.
    # The insertion order defines the column order and matches the positions
    # Age, Income, Savings and Credit_income_ratio of the Features enum.
    _BETA_PARAMS = {
        'age': {'a': 2.0, 'b': 5.0},
        'income': {'a': 2.0, 'b': 2.0},
        'savings': {'a': 5.0, 'b': 1.0},
        'credit_income_ratio': {'a': 0.5, 'b': 0.5},
    }

    @staticmethod
    def sample(n_years: int, n_data_per_year: int, seed: int = None,
               cov: np.ndarray = None, include_economic_factor: bool = True) -> pd.DataFrame:
        r"""Sample credit default data.

        Return a pandas DataFrame that contains some credit features together with the default probability
        and an indicator if the default occurred (1) or if the credit did not default (0). The default
        probability (pd) of a credit is computed by a logistic function with fixed coefficients.
        The following features are used

        - :math:`x_{\mbox{age}}` age of borrower, sampled from beta distribution (a=2.0, b=5.0)
        - :math:`x_{\mbox{income}}` income of borrower, sampled from beta distribution (a=2.0, b=2.0)
        - :math:`x_{\mbox{savings}}` savings of borrower, sampled from beta distribution (a=5.0, b=1.0)
        - :math:`x_{\mbox{ratio}}` credit income ratio, sampled from beta distribution (a=0.5, b=0.5)
        - :math:`x_{\mbox{economic}}` economic factor, uniformly sampled from [0,0.5] once per year, except for
          the last year where it is set to 1.0
        - :math:`x_{\mbox{zip}}` zip code of borrower, uniformly sampled from 10 categories and one hot encoded
          into the columns ``zip_code_0``, ..., ``zip_code_9``. Each zip code has a fixed score
          (see ``zip_code_score``) that enters the pd.

        The single features (except the zip code and the economic factor) are related via a Gaussian copula.
        The following figure shows the distributions and pairplots for a generated sample of features.

        .. image:: ../../../docs/source/figs/credit_default_features.png
            :align: center
            :width: 400

        After the features have been generated, the default probabilities (pd) are computed via the formula

        .. math::
            pd = \frac{1}{1+e^{2\left(5 x_{\mbox{age}}(1-x_{\mbox{age}}) - 1.5 x_{\mbox{ratio}}
                 + 1.5 x_{\mbox{income}}^2 + 1.5 x_{\mbox{savings}} + (1 - x_{\mbox{economic}})
                 + s_{\mbox{zip}}\right)}}

        where :math:`s_{\mbox{zip}}` is the score of the zip code and the economic term is dropped if the
        economic factor is not included.

        Args:
            n_years (int): Number of years that are sampled.
            n_data_per_year (int): Number of data sampled per year (the final DataFrame has
                n_years*n_data_per_year rows, ordered by year).
            seed (int, optional): The seed passed to ``np.random.seed``, if None, no seed will be set. Defaults to None.
            cov (np.ndarray, optional): 4x4 covariance matrix used in the Gaussian copula. Defaults to None
                (then unit variances and a flat covariance of 0.95 are used).
            include_economic_factor (bool, optional): If True, the column ``economic_factor`` is added and
                enters the pd. Defaults to True.

        Returns:
            pd.DataFrame: DataFrame with features, default probabilities (``default_prob``) and default
            indicator (``defaulted``).
        """
        if seed is not None:
            np.random.seed(seed)
        if cov is None:
            cov = np.array([[1.0, 0.95, 0.95, 0.95],
                            [0.95, 1.0, 0.95, 0.95],
                            [0.95, 0.95, 1.0, 0.95],
                            [0.95, 0.95, 0.95, 1.0]])
        mean = np.array([0.0, 0.0, 0.0, 0.0])
        n_total = n_years * n_data_per_year
        n_zip_codes = len(CreditDefaultDataCategoricalFeature.zip_code_score)
        names = list(CreditDefaultDataCategoricalFeature._BETA_PARAMS)
        columns = list(names)

        if include_economic_factor:
            columns.append('economic_factor')
            economic_score = np.random.uniform(0, 0.5, size=n_years)
            if n_years > 0:  # guard: indexing [-1] on an empty array would raise
                economic_score[-1] = 1.0
        n_base = len(columns)
        columns.extend('zip_code_' + str(i) for i in range(n_zip_codes))

        # Fill a plain numpy buffer and build the DataFrame once at the end.
        # Chained assignment on a DataFrame (df[col][a:b] = ...) does not write
        # into the frame under pandas Copy-on-Write.
        values = np.empty((n_total, len(columns)))
        for y in range(n_years):
            rows = slice(y * n_data_per_year, (y + 1) * n_data_per_year)
            if include_economic_factor:
                values[rows, len(names)] = economic_score[y]
            # Gaussian copula: correlated normals -> uniforms -> beta marginals.
            normals = np.random.multivariate_normal(mean=mean, cov=cov, size=n_data_per_year)
            uniforms = stats.norm.cdf(normals)
            for j, name in enumerate(names):
                values[rows, j] = stats.beta.ppf(
                    uniforms[:, j], **CreditDefaultDataCategoricalFeature._BETA_PARAMS[name])

        # One-hot encode against the full, fixed set of zip codes so that every
        # zip code gets a float column even if it was never drawn. _predict
        # relies on all columns being present and in order.
        zip_code_values = np.random.randint(0, n_zip_codes, size=n_total)
        values[:, n_base:] = np.eye(n_zip_codes)[zip_code_values]
        df = pd.DataFrame(values, columns=columns)

        default_prob = CreditDefaultDataCategoricalFeature._predict(df.values)
        df['default_prob'] = default_prob
        # A credit defaults if a uniform draw falls below its default probability.
        draws = np.random.uniform(low=0.0, high=1.0, size=n_total)
        df['defaulted'] = (draws < default_prob).astype(float)
        return df

    @staticmethod
    def _predict(X: np.ndarray) -> np.ndarray:
        """This method computes the pd and is called by the method sample.

        Args:
            X (np.ndarray): 2D array of features whose columns are ordered as in the Features enum
                (age, income, savings, credit income ratio, optionally the economic factor) followed by
                the one hot encoded zip code columns.

        Returns:
            np.ndarray: Vector of default probabilities.
        """
        # Accept bool/object arrays (e.g. from frames with one-hot bool columns).
        X = np.asarray(X, dtype=float)
        scores = np.array(CreditDefaultDataCategoricalFeature.zip_code_score)
        n_zip_codes = scores.shape[0]

        age = X[:, Features.Age]
        age = 5.0 * (1.0 - age) * age
        credit_income_ratio = -5.0 * 0.3 * X[:, Features.Credit_income_ratio]
        x1 = 1.5 * (X[:, Features.Income])**2
        x2 = 1.5 * X[:, Features.Savings]
        if X.shape[1] >= Features.ZIP_code + n_zip_codes:  # economic factor column is included
            x3 = 1.0 - X[:, Features.Economimc_factor]
            first_zip = int(Features.ZIP_code)
        else:
            # No economic factor column: the zip code columns start one position earlier.
            x3 = 0.0
            first_zip = int(Features.ZIP_code) - 1
        # Score of the zip code of each row (one-hot columns times scores).
        x4 = X[:, first_zip:first_zip + n_zip_codes] @ scores
        return 1.0 / (1.0 + np.exp(2.0 * (age + credit_income_ratio + x1 + x2 + x3 + x4)))