import numpy as np import matplotlib.pyplot as plt from scipy import stats from sklearn.neighbors import KernelDensity np.random.seed(1) #set random seed so you get same data n_reps = 10000 #perform 10000 times ps_null_transformed = np.zeros(n_reps) ps_diff_raw = np.zeros(n_reps) ps_dif_transformed = np.zeros(n_reps) data = np.array([0.9390851, 0.9666265, 0.7483506, 0.9564947, 0.243594, 0.9384128, 0.3984024, 0.5806361, 0.9077033, 0.9272811, 0.9013306, 0.9253298, 0.9048479, 0.9450448, 0.8598598, 0.8911161, 0.8828563,0.9820026,0.9097131,0.9449918,0.9246619,0.9933545,0.8525506,0.791416,0.8806227,0.8839348,0.8120107,0.6639317,0.9337438,0.9624416,0.9370267,0.9568143,0.9711389,0.9332489,0.9356305,0.9160964,0.6688652,0.9664063,0.4913768]).reshape(-1,1) x = np.linspace(-1, 2, 100).reshape(-1,1) kde = KernelDensity(kernel='gaussian', bandwidth=0.05).fit(data) for n in range(n_reps): sample1 = np.power(100, kde.sample(10)) sample2 = np.power(100, kde.sample(10)) (t, ps_null_transformed[n]) = stats.ttest_ind(sample1, sample2) for n in range(n_reps): sample1 = kde.sample(10) sample2 = np.sqrt(kde.sample(10)) (t, ps_diff_raw[n]) = stats.ttest_ind(sample1, sample2) for n in range(n_reps): sample1 = np.power(100,kde.sample(10)) sample2 = np.power(100,np.sqrt(kde.sample(10))) (t, ps_dif_transformed[n]) = stats.ttest_ind(sample1, sample2) fig, ax = plt.subplots(3, 2, figsize=(10,12)) plt.subplots_adjust(hspace = 0.3) #plot the distribution out data was sampled from weights = np.ones(10000)/float(10000) ax[0,0].hist(np.power(100, kde.sample(10000)), bins = 50, weights = weights) ax[0,0].set_xlabel('Sample value') ax[0,0].set_ylabel('Probability') ax[0,0].set_title('Transformed Population Distribution') plt.sca(ax[0,1]) weights = np.ones_like(ps_null_transformed)/float(len(ps_null_transformed)) ax[0,1].hist(ps_null_transformed, bins = 20, weights = weights) plt.xlabel('P Value') plt.ylabel('Probability') plt.title('P < 0.05 ' + str(np.sum(ps_null_transformed<0.05)/float(len(ps_null_transformed))*100) + '% of the time') ax[1,0].hist(kde.sample(10000), weights = weights, alpha = 0.5, bins = np.linspace(0,1.2,100), label="Sample1") ax[1,0].hist(np.sqrt(kde.sample(10000)), weights = weights, alpha = 0.5, bins = np.linspace(0,1.2,100), label="Sample2") ax[1,0].set_xlabel('Sample value') ax[1,0].set_ylabel('Probability') ax[1,0].legend(prop={'size': 10}) ax[1,0].set_title('Raw Population Distribution') plt.sca(ax[1,1]) ax[1,1].hist(ps_diff_raw, bins = 20, weights = weights) plt.xlabel('P Value') plt.ylabel('Probability') plt.title('P < 0.05 ' + str(np.sum(ps_diff_raw<0.05)/float(len(ps_diff_raw))*100) + '% of the time') ax[2,0].hist(np.power(100,kde.sample(10000)), weights = weights, alpha = 0.5, bins = np.linspace(0,200,100), label="Sample1") ax[2,0].hist(np.power(100,np.sqrt(kde.sample(10000))), weights = weights, alpha = 0.5, bins = np.linspace(0,200,100), label="Sample2") ax[2,0].set_xlabel('Sample value') ax[2,0].set_ylabel('Probability') ax[2,0].legend(prop={'size': 10}) ax[2,0].set_title('Transformed Population Distribution') plt.sca(ax[2,1]) ax[2,1].hist(ps_dif_transformed, bins = 20, weights = weights) plt.xlabel('P Value') plt.ylabel('Probability') plt.yticks(np.linspace(0, 0.18,10)) plt.title('P < 0.05 ' + str(np.sum(ps_dif_transformed<0.05)/float(len(ps_dif_transformed))*100) + '% of the time')