I made a function that can plot statistics for large arrays (10**8 elements)
in less than 2 seconds. How can I scale the Y-axis
to make the area under the graph equal to 1?
def dis(inp):
    """Plot a 64-bin histogram of ``inp`` and print summary statistics.

    The histogram shows raw counts per bin as a step curve. Vertical
    markers are drawn at the mean and at 1, 2 and 3 standard deviations on
    either side of it; the mean marker is slightly taller and thicker.

    Args:
        inp: Anything ``np.reshape`` accepts, or an object exposing a
            ``.numpy()`` method, which is called to obtain the data.

    Returns:
        None. Shows a matplotlib figure and prints the shape (only when
        ``inp`` has more than one dimension), the element count, mean,
        standard deviation, minimum and maximum.
    """
    import numpy as np
    import vaex
    import matplotlib.pyplot as plt

    # Objects with a .numpy() method are converted before flattening.
    raw = inp.numpy() if getattr(inp, "numpy", None) is not None else inp
    flat = np.reshape(raw, [-1])

    n_bins = 64
    frame = vaex.from_arrays(x=flat)
    lo, hi = frame.minmax(frame.x)

    # NOTE(review): per the vaex docs, edges=True adds three extra slots:
    # index 0 for NaN, index 1 for values below the limits and index -1 for
    # values above them. Confirm against the installed vaex version.
    counts = frame.count(binby=frame.x, shape=n_bins, limits='minmax', edges=True)
    counts[-2] += counts[-1]   # fold the overflow slot into the last real bin
    counts[-1] = counts[-2]    # repeat the last bin so where='post' draws it up to the final edge
    counts = counts[2:]        # drop the two leading extra slots
    peak = np.max(counts)
    grid = np.linspace(lo, hi, n_bins + 1)

    mean = frame.mean(frame.x)
    std = frame.std(frame.x)

    # One marker per integer multiple of std in [-3, 3]; k == 0 is the mean.
    for k in range(-3, 4):
        pos = k * std + mean
        if k == 0:
            top, width = peak * 1.02, 1
        else:
            top, width = peak * 0.97, 0.5
        plt.plot([pos, pos], [0, top], color='#34A853', linewidth=width)

    plt.step(grid, counts, where='post', color='#4285F4', linewidth=1)
    plt.show()

    if getattr(inp, "shape", None) is not None and inp.ndim > 1:
        shape_prefix = str(inp.shape) + " "
    else:
        shape_prefix = ""
    print(f'{shape_prefix}{len(flat):,}\nmean: {mean}\nstd: {std}\nmin: {lo}\nmax: {hi}')
# Example input: 10**8 samples drawn from a normal distribution (mean 0, std 1).
# NOTE(review): `np` is imported only inside dis(); this line needs
# `import numpy as np` at module level to run.
x = np.random.normal(0, 1, (10**8, ))
Complete answer, in case somebody wants to know how to plot big-data statistics:
def dis(inp):
    """Plot a unit-area 64-bin histogram of ``inp`` and print summary statistics.

    Bin counts are converted to a density (count / (total * bin width)) so
    the area under the plotted curve is 1. Vertical markers are drawn at
    the mean and at 1, 2 and 3 standard deviations on either side of it;
    the mean marker is slightly taller and thicker.

    Args:
        inp: Anything ``np.reshape`` accepts, or an object exposing a
            ``.numpy()`` method, which is called to obtain the data.

    Returns:
        None. Shows a matplotlib figure and prints the shape (only when
        ``inp`` has more than one dimension), the element count, mean,
        standard deviation, minimum and maximum.
    """
    import numpy as np
    import vaex
    import matplotlib.pyplot as plt

    # Objects with a .numpy() method are converted before flattening.
    if getattr(inp, "numpy", None) is not None:
        inp1d = np.reshape(inp.numpy(), [-1])
    else:
        inp1d = np.reshape(inp, [-1])

    bin_count = 64
    df = vaex.from_arrays(x=inp1d)
    x_min, x_max = df.minmax(df.x)

    # NOTE(review): per the vaex docs, edges=True adds three extra slots:
    # index 0 for NaN, index 1 for values below the limits and index -1 for
    # values above them. Confirm against the installed vaex version.
    counts = df.count(binby=df.x, shape=bin_count, limits='minmax', edges=True)
    counts[-2] += counts[-1]   # fold the overflow slot into the last real bin
    counts = counts[2:-1]      # keep only the bin_count real bins

    mean = df.mean(df.x)
    std = df.std(df.x)

    # Scale AUC to 1. This must use the per-bin counts: normalising after
    # the values are duplicated for the outline doubles the total and
    # leaves an area of 0.5.
    step = (x_max - x_min) / bin_count
    population = np.sum(counts)
    density = counts / (population * step)

    # Build the outline: each bin contributes its (left, right) edges with
    # the same height, giving 2 * bin_count points.
    edges = np.linspace(x_min, x_max, bin_count + 1)
    outline_x = np.column_stack((edges[:-1], edges[1:])).ravel()
    outline_y = np.repeat(density, 2)

    hist_height = np.max(outline_y)
    # One marker per integer multiple of std in [-3, 3]; k == 0 is the mean.
    for k in range(-3, 4):
        v = k * std + mean
        if k == 0:
            plt.plot([v, v], [0, hist_height * 1.02], color='#34A853', linewidth=1)
        else:
            plt.plot([v, v], [0, hist_height * 0.97], color='#34A853', linewidth=0.5)

    plt.fill_between(outline_x, outline_y, step="pre", alpha=0.3)
    plt.plot(outline_x, outline_y, color='#4285F4', linewidth=1)
    plt.show()
    print(f'{str(inp.shape) + " " if getattr(inp, "shape", None) is not None and inp.ndim > 1 else ""}{len(inp1d):,}\nmean: {mean}\nstd: {std}\nmin: {x_min}\nmax: {x_max}')
To moderators: this site doesn't allow me to post code even when it is the answer; it reports: "It looks like your post is mostly code; please add some more details."