Creating synthetic ordinal data¶

[1]:

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from jmspack.NLTSA import flatten

[2]:

# NOTE: commented-out inline version of the fixed-probability sampling loop.
# The same per-column seeding and sampling is performed by the `else` branch of
# `ordinal_data_creation`, which is defined in the next cell.
#
# column_amount = 10
# answer_options = [1, 2, 3, 4, 5]
# row_amount = 50
# answer_probabilities = [0.4, 0.1, 0.1, 0.1, 0.3]

# df = pd.DataFrame()
# for column_num in range(0, column_amount):
#     np.random.seed(69420 + column_num)
#     uniform_array = np.random.choice(
#          a=answer_options,
#          size=row_amount,
#          p=answer_probabilities
#     )
#     current_df = pd.DataFrame({f"likert_{len(answer_options)}_options_{column_num}":uniform_array})
#     df = pd.concat([df, current_df], axis=1)

[3]:

def ordinal_data_creation(column_amount,
                          answer_options,
                          row_amount,
                          answer_probabilities=None,
                          base_seed=69420):
    """Build a DataFrame of synthetic ordinal (Likert-style) answer columns.

    Each column is sampled independently with ``np.random.choice`` and is named
    ``likert_<number of options>_options_<column index>``.

    Parameters
    ----------
    column_amount : int
        Number of columns to generate.
    answer_options : sequence
        The values a cell may take (e.g. ``[1, 2, 3, 4, 5]``).
    row_amount : int
        Number of rows (samples) per column.
    answer_probabilities : sequence of float or None, default None
        Probability of each entry of ``answer_options``, shared by all columns.
        When ``None``, a separate probability vector is drawn for every column
        from a flat Dirichlet distribution.
    base_seed : int, default 69420
        Column ``i`` is generated after seeding NumPy with ``base_seed + i``.

    Returns
    -------
    pandas.DataFrame
        ``row_amount`` rows by ``column_amount`` columns.

    Notes
    -----
    * The probabilities used for each column are printed as
      ``{column_name: probabilities}``.
    * The *global* NumPy random state is re-seeded for every column, so calling
      this function changes the state seen by later ``np.random`` calls.
    """
    option_count = len(answer_options)
    columns = {}

    for column_num in range(column_amount):
        # Per-column seeding keeps every column reproducible on its own.
        np.random.seed(base_seed + column_num)

        if answer_probabilities is None:
            # Draw the probability vector exactly once and reuse it for both the
            # sampling and the log line. Drawing a second vector for the print
            # would report probabilities that were never used.
            probabilities = np.random.dirichlet(
                np.ones(option_count), size=1
            )[0].tolist()
        else:
            probabilities = answer_probabilities

        column_name = f"likert_{option_count}_options_{column_num}"
        columns[column_name] = np.random.choice(
            a=answer_options,
            size=row_amount,
            p=probabilities,
        )
        print({column_name: probabilities})

    # Assemble the frame in one go instead of concatenating inside the loop.
    return pd.DataFrame(columns)

[4]:

# Number of rows shared by every generated block of columns.
row_amount = 2000

# 30 five-option items, each with its own randomly drawn answer distribution.
df_1 = ordinal_data_creation(column_amount = 30,
                        answer_options = [1, 2, 3, 4, 5],
                        row_amount = row_amount,
                        answer_probabilities = None)

# 10 five-option items sharing one fixed, U-shaped answer distribution.
df_2 = ordinal_data_creation(column_amount = 10,
                        answer_options = [1, 2, 3, 4, 5],
                        row_amount = row_amount,
                        answer_probabilities = [0.4, 0.1, 0.1, 0.1, 0.3])
# df_2's names restart at 0 and would clash with df_1, so continue the numbering
# after df_1's last column. The numeric suffix is split off at the final "_" so
# that suffixes with more than one digit are handled as well.
column_offset = len(df_1.columns)
df_2.columns = [
    f"{prefix}_{int(suffix) + column_offset}"
    for prefix, suffix in (name.rsplit("_", 1) for name in df_2.columns)
]

# 10 seven-option items with randomly drawn answer distributions.
df_3 = ordinal_data_creation(column_amount = 10,
                        answer_options = [1, 2, 3, 4, 5, 6, 7],
                        row_amount = row_amount,
                        answer_probabilities = None)

# 10 binary items that are mostly 0.
df_4 = ordinal_data_creation(column_amount = 10,
                        answer_options = [0, 1],
                        row_amount = row_amount,
                        answer_probabilities = [0.9, 0.1])

# A single binary column that serves as the classification target.
bool_df = ordinal_data_creation(column_amount = 1,
                        answer_options = [0, 1],
                        row_amount = row_amount,
                        answer_probabilities = [0.6, 0.4])
bool_df.columns = ["target"]

df = pd.concat([df_1, df_2, df_3, df_4, bool_df], axis=1)

# Guard against silent name clashes or misaligned blocks after the concat.
assert df.columns.is_unique, "duplicate column names after concatenation"
assert len(df) == row_amount

{'likert_5_options_0': [0.3260497316019787, 0.17021080937036878, 0.20238572809340846, 0.1398947366106401, 0.161458994323604]}
{'likert_5_options_1': [0.08777919292218628, 0.11191711990196725, 0.18016252873285118, 0.16844854627309122, 0.451692612169904]}
{'likert_5_options_2': [0.49617221548341156, 0.18379184363308368, 0.08760265629706548, 0.13114380297375605, 0.10128948161268306]}
{'likert_5_options_3': [0.15041858482432638, 0.029361316457183314, 0.08469127721920586, 0.250929064299109, 0.4845997572001755]}
{'likert_5_options_4': [0.36942123568391816, 0.1309521144166391, 0.2848324249275291, 0.0655132232116965, 0.14928100176021722]}
{'likert_5_options_5': [0.1380782844001991, 0.2292352905787918, 0.1981735388766408, 0.06811444737903176, 0.3663984387653367]}
{'likert_5_options_6': [0.012044480134060658, 0.35340651273441764, 0.19850441118878964, 0.1932497693009291, 0.242794826641803]}
{'likert_5_options_7': [0.13228973044665338, 0.017301326305110536, 0.1351938215960546, 0.15865722964735782, 0.5565578920048236]}
{'likert_5_options_8': [0.2698073421453759, 0.05445708561248983, 0.06204225418965961, 0.2794578926464866, 0.33423542540598805]}
{'likert_5_options_9': [0.22096209876409983, 0.3683702355202157, 0.21923184530656803, 0.08826180074593339, 0.1031740196631831]}
{'likert_5_options_10': [0.03445579213251957, 0.18057639545131451, 0.40157480972500653, 0.1476217929368393, 0.23577120975431992]}
{'likert_5_options_11': [0.7148096888129006, 0.10989507530562942, 0.010073028008272233, 0.13498396016658315, 0.030238247706614672]}
{'likert_5_options_12': [0.12123685754213585, 0.24736155229396636, 0.32053081160376956, 0.02605395835861544, 0.2848168202015126]}
{'likert_5_options_13': [0.21984349797749753, 0.09678757891178506, 0.4042007039150064, 0.16740572884710056, 0.11176249034861045]}
{'likert_5_options_14': [0.02509355235514322, 0.13753165226911512, 0.46603295214465384, 0.044905108355807175, 0.3264367348752807]}
{'likert_5_options_15': [0.2855871540139263, 0.20894890559295617, 0.20402737729593975, 0.08020158701072012, 0.2212349760864576]}
{'likert_5_options_16': [0.016859528519665778, 0.27730946617877567, 0.032919197992800894, 0.009050903877528254, 0.6638609034312295]}
{'likert_5_options_17': [0.2203553898310582, 0.19043715106320505, 0.26846055756337334, 0.07500563545665113, 0.24574126608571228]}
{'likert_5_options_18': [0.0338898063660841, 0.03364472110024661, 0.006649480265914805, 0.3285978302988614, 0.597218161968893]}
{'likert_5_options_19': [0.2649540455530784, 0.041275797296955075, 0.3580082957095305, 0.11183640185947781, 0.22392545958095827]}
{'likert_5_options_20': [0.13533684852703093, 0.11582961199752453, 0.21316226372491828, 0.32778518803935697, 0.20788608771116934]}
{'likert_5_options_21': [0.05640742595207831, 0.1920758931744514, 0.40038028091433736, 0.2537056583615812, 0.09743074159755184]}
{'likert_5_options_22': [0.28443685227821247, 0.5421179617737621, 0.0048147752207973755, 0.050354861112876105, 0.11827554961435177]}
{'likert_5_options_23': [0.054459358300603576, 0.16520101455555808, 0.10167329249009253, 0.6622693855374544, 0.016396949116291387]}
{'likert_5_options_24': [0.3550352259713607, 0.2010220837605364, 0.046551471467427864, 0.34737583144762874, 0.05001538735304646]}
{'likert_5_options_25': [0.12711980345735233, 0.07975061150979235, 0.2579434097195905, 0.3354937634925402, 0.1996924118207248]}
{'likert_5_options_26': [0.16897560466843864, 0.2144783582676875, 0.19510556117269978, 0.26335436079387153, 0.15808611509730258]}
{'likert_5_options_27': [0.5000281488694938, 0.11227324105677823, 0.13223729504290851, 0.11134772090231362, 0.1441135941285058]}
{'likert_5_options_28': [0.6288642666666908, 0.2911760749369517, 0.048267228567051734, 0.009332037774234072, 0.022360392055071808]}
{'likert_5_options_29': [0.3060951893799885, 0.10047492821004562, 0.05119456177898828, 0.16546396248255338, 0.3767713581484242]}
{'likert_5_options_0': [0.4, 0.1, 0.1, 0.1, 0.3]}
{'likert_5_options_1': [0.4, 0.1, 0.1, 0.1, 0.3]}
{'likert_5_options_2': [0.4, 0.1, 0.1, 0.1, 0.3]}
{'likert_5_options_3': [0.4, 0.1, 0.1, 0.1, 0.3]}
{'likert_5_options_4': [0.4, 0.1, 0.1, 0.1, 0.3]}
{'likert_5_options_5': [0.4, 0.1, 0.1, 0.1, 0.3]}
{'likert_5_options_6': [0.4, 0.1, 0.1, 0.1, 0.3]}
{'likert_5_options_7': [0.4, 0.1, 0.1, 0.1, 0.3]}
{'likert_5_options_8': [0.4, 0.1, 0.1, 0.1, 0.3]}
{'likert_5_options_9': [0.4, 0.1, 0.1, 0.1, 0.3]}
{'likert_7_options_0': [0.1028642990063549, 0.07110271140013663, 0.08206293213374803, 0.08160071375445849, 0.2988614359954494, 0.14820552899603343, 0.21530237871381908]}
{'likert_7_options_1': [0.08090668982320186, 0.07564622000113605, 0.20284436683530271, 0.023302508402303915, 0.20249740220187804, 0.246759572881609, 0.16804323985456854]}
{'likert_7_options_2': [0.10168244281641418, 0.15222166553244953, 0.11756905963056842, 0.2015269559575192, 0.2521656104960905, 0.16866089363112086, 0.006173371935837414]}
{'likert_7_options_3': [0.0578691522706486, 0.17145865203416133, 0.3311247398849803, 0.029926162829100576, 0.2493314940534007, 0.13656296974397455, 0.023726829183734033]}
{'likert_7_options_4': [0.23675439571638918, 0.05445497849082502, 0.12408325131055528, 0.19856504660459984, 0.03032861912119273, 0.06316563373163855, 0.29264807502479945]}
{'likert_7_options_5': [0.21386868717561103, 0.07350904425092684, 0.39541683277258505, 0.2057930008345625, 0.0077290947846081325, 0.06297157931145612, 0.04071176087025043]}
{'likert_7_options_6': [0.15644642602492062, 0.15230510776164646, 0.19135284026169488, 0.20262313806708032, 0.1463303540024974, 0.13234715165476776, 0.018594982227392608]}
{'likert_7_options_7': [0.12250488693405796, 0.1437660815395402, 0.5043208397202252, 0.019363228491196465, 0.002086193364440244, 0.06298139986216192, 0.1449773700883779]}
{'likert_7_options_8': [0.030409087188572513, 0.1369721254331551, 0.16382051757194668, 0.14078351304461942, 0.011748444865206209, 0.11471043208046906, 0.4015558798160309]}
{'likert_7_options_9': [0.30552449063496173, 0.12300284968963812, 0.1437847214225458, 0.21252388424569238, 0.01323052904302421, 0.04479524257702839, 0.1571382823871094]}
{'likert_2_options_0': [0.9, 0.1]}
{'likert_2_options_1': [0.9, 0.1]}
{'likert_2_options_2': [0.9, 0.1]}
{'likert_2_options_3': [0.9, 0.1]}
{'likert_2_options_4': [0.9, 0.1]}
{'likert_2_options_5': [0.9, 0.1]}
{'likert_2_options_6': [0.9, 0.1]}
{'likert_2_options_7': [0.9, 0.1]}
{'likert_2_options_8': [0.9, 0.1]}
{'likert_2_options_9': [0.9, 0.1]}
{'likert_2_options_0': [0.6, 0.4]}

[5]:

# Preview the first five rows of the combined synthetic frame.
df.head()

[5]:

	likert_5_options_0	likert_5_options_1	likert_5_options_2	likert_5_options_3	likert_5_options_4	likert_5_options_5	likert_5_options_6	likert_5_options_7	likert_5_options_8	likert_5_options_9	...	likert_2_options_2	likert_2_options_3	likert_2_options_4	likert_2_options_5	likert_2_options_8
0	1	4	2	1	2	3	3	5	5	3	...	0	0	0	0	0
1	3	1	2	5	1	4	4	5	1	2	...	1	0	1	0	0
2	2	3	3	1	1	3	3	5	4	4	...	0	0	0	1	0
3	5	2	2	5	2	1	3	2	5	3	...	0	0	0	0	0
4	3	4	2	5	2	4	3	5	4	1	...	0	1	1	0	1

5 rows × 61 columns

Shuffle the target so that no variable perfectly classifies it¶

[6]:

# Plot one of the fixed-probability 5-option items against row position,
# coloured by target, before any shuffling is applied.
_ = sns.scatterplot(
    data=df,
    x=df.index,
    y="likert_5_options_30",
    hue="target",
)

[7]:

# Shuffle the target values within the leading part of the frame.
# `.loc` slices are label-inclusive, so labels 0 through `last_label` are affected.
last_label = int(row_amount/2)
shuffled_target = (
    df.loc[0:last_label, "target"]
    .sample(frac=1, random_state=69420)
    # Renumber from 0 so the assignment below aligns the shuffled values with
    # labels 0..last_label instead of putting each value back on its old row.
    .reset_index(drop=True)
)
df.loc[0:last_label, "target"] = shuffled_target

[8]:

# Shuffle the row order reproducibly, then renumber the index from 0.
row_shuffled = df.sample(frac=1, random_state=69420)
df = row_shuffled.reset_index(drop=True)

[9]:

# Same scatter plot as before, drawn again on the shuffled frame for comparison.
_ = sns.scatterplot(
    data=df,
    x=df.index,
    y="likert_5_options_30",
    hue="target",
)

[10]:

# Preview the first five rows again, now that the target and the row order have been shuffled.
df.head()

[10]:

	likert_5_options_0	likert_5_options_1	likert_5_options_2	likert_5_options_3	likert_5_options_4	likert_5_options_5	likert_5_options_6	likert_5_options_7	likert_5_options_8	likert_5_options_9	...	likert_2_options_1	likert_2_options_4	likert_2_options_7	target
0	5	1	2	5	5	3	3	5	5	2	...	1	0	0	0
1	1	5	5	1	5	3	3	5	5	2	...	1	0	0	0
2	5	4	2	5	2	4	3	5	1	3	...	0	0	0	1
3	5	4	3	5	5	3	5	4	1	2	...	0	0	1	0
4	5	5	3	5	2	3	3	5	1	3	...	0	1	0	0

5 rows × 61 columns

[11]:

# Print the per-column dtype and non-null count summary of the final frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 61 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   likert_5_options_0   2000 non-null   int64
 1   likert_5_options_1   2000 non-null   int64
 2   likert_5_options_2   2000 non-null   int64
 3   likert_5_options_3   2000 non-null   int64
 4   likert_5_options_4   2000 non-null   int64
 5   likert_5_options_5   2000 non-null   int64
 6   likert_5_options_6   2000 non-null   int64
 7   likert_5_options_7   2000 non-null   int64
 8   likert_5_options_8   2000 non-null   int64
 9   likert_5_options_9   2000 non-null   int64
 10  likert_5_options_10  2000 non-null   int64
 11  likert_5_options_11  2000 non-null   int64
 12  likert_5_options_12  2000 non-null   int64
 13  likert_5_options_13  2000 non-null   int64
 14  likert_5_options_14  2000 non-null   int64
 15  likert_5_options_15  2000 non-null   int64
 16  likert_5_options_16  2000 non-null   int64
 17  likert_5_options_17  2000 non-null   int64
 18  likert_5_options_18  2000 non-null   int64
 19  likert_5_options_19  2000 non-null   int64
 20  likert_5_options_20  2000 non-null   int64
 21  likert_5_options_21  2000 non-null   int64
 22  likert_5_options_22  2000 non-null   int64
 23  likert_5_options_23  2000 non-null   int64
 24  likert_5_options_24  2000 non-null   int64
 25  likert_5_options_25  2000 non-null   int64
 26  likert_5_options_26  2000 non-null   int64
 27  likert_5_options_27  2000 non-null   int64
 28  likert_5_options_28  2000 non-null   int64
 29  likert_5_options_29  2000 non-null   int64
 30  likert_5_options_30  2000 non-null   int64
 31  likert_5_options_31  2000 non-null   int64
 32  likert_5_options_32  2000 non-null   int64
 33  likert_5_options_33  2000 non-null   int64
 34  likert_5_options_34  2000 non-null   int64
 35  likert_5_options_35  2000 non-null   int64
 36  likert_5_options_36  2000 non-null   int64
 37  likert_5_options_37  2000 non-null   int64
 38  likert_5_options_38  2000 non-null   int64
 39  likert_5_options_39  2000 non-null   int64
 40  likert_7_options_0   2000 non-null   int64
 41  likert_7_options_1   2000 non-null   int64
 42  likert_7_options_2   2000 non-null   int64
 43  likert_7_options_3   2000 non-null   int64
 44  likert_7_options_4   2000 non-null   int64
 45  likert_7_options_5   2000 non-null   int64
 46  likert_7_options_6   2000 non-null   int64
 47  likert_7_options_7   2000 non-null   int64
 48  likert_7_options_8   2000 non-null   int64
 49  likert_7_options_9   2000 non-null   int64
 50  likert_2_options_0   2000 non-null   int64
 51  likert_2_options_1   2000 non-null   int64
 52  likert_2_options_2   2000 non-null   int64
 53  likert_2_options_3   2000 non-null   int64
 54  likert_2_options_4   2000 non-null   int64
 55  likert_2_options_5   2000 non-null   int64
 56  likert_2_options_6   2000 non-null   int64
 57  likert_2_options_7   2000 non-null   int64
 58  likert_2_options_8   2000 non-null   int64
 59  likert_2_options_9   2000 non-null   int64
 60  target               2000 non-null   int64
dtypes: int64(61)
memory usage: 953.2 KB

[12]:

# Every column except the target is treated as a feature.
features_list = [column for column in df.columns if column != "target"]

[13]:

# One row of axes per feature: a target-split histogram on the left (twice as wide)
# and a per-target boxplot on the right.
fig, axs = plt.subplots(
    nrows=len(features_list),
    ncols=2,
    figsize=(20,100),
    gridspec_kw={'width_ratios': [2, 1]},
)
fig.subplots_adjust(hspace = 0.5, wspace=0.1)
axs = axs.ravel()
# After ravel() the axes are in row-major order: even positions are the left
# column, odd positions the right column.
for feature, hist_ax, box_ax in zip(features_list, axs[0::2], axs[1::2]):
    _ = sns.histplot(data=df, x=feature, hue="target", kde=False, bins=10, ax=hist_ax)
    _ = sns.boxplot(data=df, x="target", y=feature, ax=box_ax)

[14]:

# Persist the final frame as CSV. With the default settings the row index is
# written as the first column.
output_path = Path("data/synthetic_likert_data.csv")
# Create the output directory if it is missing so the export does not fail on a
# fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
_ = df.to_csv(output_path)

	likert_5_options_0	likert_5_options_1	likert_5_options_2	likert_5_options_3	likert_5_options_4	likert_5_options_5	likert_5_options_6	likert_5_options_7	likert_5_options_8	likert_5_options_9	...	likert_2_options_2	likert_2_options_3	likert_2_options_4	likert_2_options_5	likert_2_options_8
0	1	4	2	1	2	3	3	5	5	3	...	0	0	0	0	0
1	3	1	2	5	1	4	4	5	1	2	...	1	0	1	0	0
2	2	3	3	1	1	3	3	5	4	4	...	0	0	0	1	0
3	5	2	2	5	2	1	3	2	5	3	...	0	0	0	0	0
4	3	4	2	5	2	4	3	5	4	1	...	0	1	1	0	1

	likert_5_options_0	likert_5_options_1	likert_5_options_2	likert_5_options_3	likert_5_options_4	likert_5_options_5	likert_5_options_6	likert_5_options_7	likert_5_options_8	likert_5_options_9	...	likert_2_options_1	likert_2_options_4	likert_2_options_7	target
0	5	1	2	5	5	3	3	5	5	2	...	1	0	0	0
1	1	5	5	1	5	3	3	5	5	2	...	1	0	0	0
2	5	4	2	5	2	4	3	5	1	3	...	0	0	0	1
3	5	4	3	5	5	3	5	4	1	2	...	0	0	1	0
4	5	5	3	5	2	3	3	5	1	3	...	0	1	0	0

	likert_5_options_0	likert_5_options_1	likert_5_options_2	likert_5_options_3	likert_5_options_4	likert_5_options_5	likert_5_options_6	likert_5_options_7	likert_5_options_8	likert_5_options_9	...	likert_2_options_2	likert_2_options_3	likert_2_options_4	likert_2_options_5	likert_2_options_8
0	1	4	2	1	2	3	3	5	5	3	...	0	0	0	0	0
1	3	1	2	5	1	4	4	5	1	2	...	1	0	1	0	0
2	2	3	3	1	1	3	3	5	4	4	...	0	0	0	1	0
3	5	2	2	5	2	1	3	2	5	3	...	0	0	0	0	0
4	3	4	2	5	2	4	3	5	4	1	...	0	1	1	0	1

	likert_5_options_0	likert_5_options_1	likert_5_options_2	likert_5_options_3	likert_5_options_4	likert_5_options_5	likert_5_options_6	likert_5_options_7	likert_5_options_8	likert_5_options_9	...	likert_2_options_1	likert_2_options_4	likert_2_options_7	target
0	5	1	2	5	5	3	3	5	5	2	...	1	0	0	0
1	1	5	5	1	5	3	3	5	5	2	...	1	0	0	0
2	5	4	2	5	2	4	3	5	1	3	...	0	0	0	1
3	5	4	3	5	5	3	5	4	1	2	...	0	0	1	0
4	5	5	3	5	2	3	3	5	1	3	...	0	1	0	0

	likert_5_options_0	likert_5_options_1	likert_5_options_2	likert_5_options_3	likert_5_options_4	likert_5_options_5	likert_5_options_6	likert_5_options_7	likert_5_options_8	likert_5_options_9	...	likert_2_options_2	likert_2_options_3	likert_2_options_4	likert_2_options_5	likert_2_options_8
0	1	4	2	1	2	3	3	5	5	3	...	0	0	0	0	0
1	3	1	2	5	1	4	4	5	1	2	...	1	0	1	0	0
2	2	3	3	1	1	3	3	5	4	4	...	0	0	0	1	0
3	5	2	2	5	2	1	3	2	5	3	...	0	0	0	0	0
4	3	4	2	5	2	4	3	5	4	1	...	0	1	1	0	1

	likert_5_options_0	likert_5_options_1	likert_5_options_2	likert_5_options_3	likert_5_options_4	likert_5_options_5	likert_5_options_6	likert_5_options_7	likert_5_options_8	likert_5_options_9	...	likert_2_options_1	likert_2_options_4	likert_2_options_7	target
0	5	1	2	5	5	3	3	5	5	2	...	1	0	0	0
1	1	5	5	1	5	3	3	5	5	2	...	1	0	0	0
2	5	4	2	5	2	4	3	5	1	3	...	0	0	0	1
3	5	4	3	5	5	3	5	4	1	2	...	0	0	1	0
4	5	5	3	5	2	3	3	5	1	3	...	0	1	0	0