0

I made a function that can plot statistics for large arrays (10**8 elements) in less than 2 seconds. How can I scale the Y-axis to make the area under the graph equal to 1?

def dis(inp):
  """Plot a 64-bin count histogram of `inp` and print summary statistics.

  `inp` may be anything `np.reshape` accepts, or an object exposing a
  `.numpy()` method (which is called first). The data is flattened to 1-D,
  binned with vaex between its minimum and maximum, and drawn as a step
  curve. Green vertical lines mark mean + k*std for k = -3..3; the line at
  the mean is drawn taller and thicker than the others.

  Returns None. Side effects: shows a matplotlib figure and prints the
  element count (prefixed by the shape when `inp` has more than one
  dimension), mean, std, min and max.
  """
  import numpy as np
  import vaex
  import matplotlib.pyplot as plt

  # Objects with a non-None `numpy` attribute are converted through it.
  to_numpy = getattr(inp, "numpy", None)
  raw = inp if to_numpy is None else inp.numpy()
  inp1d = np.reshape(raw, [-1])

  n_bins = 64
  df = vaex.from_arrays(x=inp1d)
  x_min, x_max = df.minmax(df.x)

  counts = df.count(binby=df.x, shape=n_bins, limits='minmax', edges=True)
  # edges=True yields extra entries around the regular bins (presumably
  # missing/underflow in front and overflow at the end - confirm in the
  # vaex docs). The last entry is folded into the last regular bin.
  counts[-2] += counts[-1]
  # The final value is repeated so that, with where='post', the last bin is
  # drawn across its full width up to the final edge.
  counts[-1] = counts[-2]
  counts = counts[2:]
  peak = np.max(counts)
  bin_edges = np.linspace(x_min, x_max, n_bins + 1)
  mean = df.mean(df.x)
  std = df.std(df.x)

  marker_color = '#34A853'
  for k in range(-3, 4):
    pos = k * std + mean
    if k == 0:
      # The mean itself: slightly taller, thicker line.
      top, width = peak * 1.02, 1
    else:
      top, width = peak * 0.97, 0.5
    plt.plot([pos, pos], [0, top], color=marker_color, linewidth=width)

  plt.step(bin_edges, counts, where='post', color='#4285F4', linewidth=1)
  plt.show()
  print(f'{str(inp.shape) + " " if getattr(inp, "shape", None) is not None and inp.ndim > 1 else ""}{len(inp1d):,}\nmean: {mean}\nstd: {std}\nmin: {x_min}\nmax: {x_max}')

# Example input: 10**8 samples drawn from a normal distribution with mean 0
# and standard deviation 1.
# NOTE(review): in the code shown, numpy is imported only inside dis(); this
# line needs a module-level `import numpy as np` to run on its own.
x = np.random.normal(0, 1, (10**8, ))

enter image description here

Complete answer, in case somebody wants to know how to plot big-data statistics:

def dis(inp):
  """Plot a unit-area, 64-bin histogram of `inp` and print summary statistics.

  `inp` may be anything `np.reshape` accepts, or an object exposing a
  `.numpy()` method (which is called first). The data is flattened to 1-D
  and binned with vaex between its minimum and maximum. Bin heights are
  scaled so that the area under the histogram equals 1, then drawn as a
  filled outline. Green vertical lines mark mean + k*std for k = -3..3; the
  line at the mean is drawn taller and thicker than the others.

  Returns None. Side effects: shows a matplotlib figure and prints the
  element count (prefixed by the shape when `inp` has more than one
  dimension), mean, std, min and max.
  """
  import numpy as np
  import vaex
  import matplotlib.pyplot as plt

  # Objects with a non-None `numpy` attribute are converted through it.
  if getattr(inp, "numpy", None) is not None:
    inp1d = np.reshape(inp.numpy(), [-1])
  else:
    inp1d = np.reshape(inp, [-1])

  bin_count = 64
  df = vaex.from_arrays(x=inp1d)
  x_min, x_max = df.minmax(df.x)
  bins = df.count(binby=df.x, shape=bin_count, limits='minmax', edges=True)
  # edges=True yields extra entries around the regular bins (presumably
  # missing/underflow in front and overflow at the end - confirm in the
  # vaex docs). Fold the last entry into the last regular bin, then keep
  # only the bin_count regular bins.
  bins[-2] += bins[-1]
  bins = bins[2:-1]
  edges = np.linspace(x_min, x_max, bin_count+1)
  mean = df.mean(df.x)
  std = df.std(df.x)

  # Scale AUC to 1.
  # This has to use the per-bin counts, i.e. run BEFORE the heights are
  # duplicated for drawing below: summing the duplicated array would count
  # every sample twice and leave the plotted area at 0.5 instead of 1.
  step = (x_max-x_min)/bin_count
  population = np.sum(bins)
  surface = population*step
  bins = bins/surface
  hist_height = np.max(bins)

  # Build an explicit outline: each bin contributes its left and right edge,
  # both at the bin's height, giving 2*bin_count points.
  left, right = edges[:-1], edges[1:]
  edges = np.reshape(np.array([left,right]).T, [-1])
  bins = np.reshape(np.array([bins,bins]).T, [-1])

  for i, v in enumerate([x * std + mean for x in range(-3, 4)]):
    if i == 3:
      # i == 3 corresponds to the mean itself: slightly taller, thicker line.
      plt.plot([v, v], [0, hist_height * 1.02], color='#34A853', linewidth=1)
    else:
      plt.plot([v, v], [0, hist_height * 0.97], color='#34A853', linewidth=0.5)

  plt.fill_between(edges, bins, step="pre", alpha=0.3)
  plt.plot(edges, bins, color='#4285F4', linewidth=1)
  plt.show()
  print(f'{str(inp.shape) + " " if getattr(inp, "shape", None) is not None and inp.ndim > 1 else ""}{len(inp1d):,}\nmean: {mean}\nstd: {std}\nmin: {x_min}\nmax: {x_max}')

enter image description here

To moderators: this site doesn't allow me to post code even if it is the answer: "It looks like your post is mostly code; please add some more details."

4

1 回答 1

0

这个想法是标准化您的数据集,即将每列的高度除以直方图的 AUC(曲线下面积)。

在“plt.step(...)”之前写:

# Width of a single bin along the x-axis.
step = (x_max-x_min)/bin_count
# Sum of all entries in `bins`.
# NOTE(review): in the question's dis(), `bins` carries a duplicated last
# element at this point (added for where='post'), so that bin would be
# counted twice here - confirm, and sum bins[:-1] if so.
population = np.sum(bins)
# Area under the unscaled histogram: total count times bin width.
surface = population*step
# Dividing every height by that area scales the area under the curve to 1.
# NOTE(review): the question computes hist_height from the unscaled bins
# before this point, so the marker-line heights would need recomputing.
bins = bins/surface

希望能有所帮助

于 2019-09-17T17:49:54.160 回答