Background
Here is a video of a clip from an electronic song. At the start of the video the song plays at full speed. When the song is slowed down, you can hear all of the unique sounds the song uses. Some of those sounds repeat.
Problem description
What I want to do is create a visual like the one below, where each unique sound gets its own horizontal track/row, and a colored block on that track marks every time frame in the song during which that sound is playing. The tracks/rows should be ordered by how similar the sounds are to one another, with more similar sounds placed closer together. If two sounds are so alike that a human cannot tell them apart, they should be treated as the same sound.
- I will accept an imperfect solution as long as it generally does what I'm asking for
- Watch the video linked above for a demonstration of what I'm describing; it includes a grid visual I created by hand, which is almost exactly the grid I'm trying to generate
For example, if each of the 5 waves below represented the sound wave produced by one sound, then each of those sounds would be considered similar and would be placed close to one another vertically on the grid.
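To make the target output concrete, here is a minimal sketch (with made-up placeholder data) of how such a grid could be drawn in matplotlib; `frame_labels` and `frame_times` are hypothetical stand-ins for the per-frame sound assignments I am trying to compute:

```python
import numpy as np
import matplotlib.pyplot as plt

# Made-up placeholder data: one sound label per time frame, plus the frame edges in seconds.
frame_labels = np.array([0, 0, 1, 2, 1, 0, 3, 3, 2, 1])
frame_times = np.linspace(0.0, 5.0, len(frame_labels) + 1)

n_sounds = frame_labels.max() + 1
cmap = plt.get_cmap('tab10', n_sounds)

fig, ax = plt.subplots(figsize=(8, 3))
# One horizontal row per unique sound; a colored block in that row for every frame where it plays.
for i, label in enumerate(frame_labels):
    ax.barh(y=label,
            width=frame_times[i + 1] - frame_times[i],
            left=frame_times[i],
            height=0.8,
            color=cmap(label))

ax.set_xlabel('time (s)')
ax.set_ylabel('sound (one row per unique sound)')
ax.set_yticks(range(n_sounds))
plt.show()
```

The hard part, of course, is computing those per-frame labels and ordering the rows by similarity.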
Attempts
I have been looking at the Laplacian segmentation example in librosa. The figure labeled "Structure components" looks like it might be what I need. From reading the paper, it seems they are trying to split a song into sections such as chorus, verse, bridge, etc., whereas I am really trying to split the song into 1- or 2-beat chunks.
Here is the code for the Laplacian segmentation (there is also a Jupyter notebook version if you prefer).
# -*- coding: utf-8 -*-
"""
======================
Laplacian segmentation
======================
This notebook implements the laplacian segmentation method of
`McFee and Ellis, 2014 <http://bmcfee.github.io/papers/ismir2014_spectral.pdf>`_,
with a couple of minor stability improvements.
Throughout the example, we will refer to equations in the paper by number, so it will be
helpful to read along.
"""
# Code source: Brian McFee
# License: ISC
###################################
# Imports
# - numpy for basic functionality
# - scipy for graph Laplacian
# - matplotlib for visualization
# - sklearn.cluster for K-Means
#
import numpy as np
import scipy
import matplotlib.pyplot as plt
import sklearn.cluster
import librosa
import librosa.display
import matplotlib.patches as patches
#############################
# First, we'll load in a song
# substitute your own audio file for the librosa example track
y, sr = librosa.load(librosa.ex('fishin'))
##############################################
# Next, we'll compute and plot a log-power CQT
BINS_PER_OCTAVE = 12 * 3
N_OCTAVES = 7
C = librosa.amplitude_to_db(np.abs(librosa.cqt(y=y, sr=sr,
bins_per_octave=BINS_PER_OCTAVE,
n_bins=N_OCTAVES * BINS_PER_OCTAVE)),
ref=np.max)
fig, ax = plt.subplots()
librosa.display.specshow(C, y_axis='cqt_hz', sr=sr,
bins_per_octave=BINS_PER_OCTAVE,
x_axis='time', ax=ax)
##########################################################
# To reduce dimensionality, we'll beat-synchronize the CQT
tempo, beats = librosa.beat.beat_track(y=y, sr=sr, trim=False)
Csync = librosa.util.sync(C, beats, aggregate=np.median)
# For plotting purposes, we'll need the timing of the beats
# we fix_frames to include non-beat frames 0 and C.shape[1] (final frame)
beat_times = librosa.frames_to_time(librosa.util.fix_frames(beats,
x_min=0,
x_max=C.shape[1]),
sr=sr)
fig, ax = plt.subplots()
librosa.display.specshow(Csync, bins_per_octave=12*3,
y_axis='cqt_hz', x_axis='time',
x_coords=beat_times, ax=ax)
#####################################################################
# Let's build a weighted recurrence matrix using beat-synchronous CQT
# (Equation 1)
# width=3 prevents links within the same bar
# mode='affinity' here implements S_rep (after Eq. 8)
R = librosa.segment.recurrence_matrix(Csync, width=3, mode='affinity',
sym=True)
# Enhance diagonals with a median filter (Equation 2)
df = librosa.segment.timelag_filter(scipy.ndimage.median_filter)
Rf = df(R, size=(1, 7))
###################################################################
# Now let's build the sequence matrix (S_loc) using mfcc-similarity
#
# :math:`R_\text{path}[i, i\pm 1] = \exp(-\|C_i - C_{i\pm 1}\|^2 / \sigma^2)`
#
# Here, we take :math:`\sigma` to be the median distance between successive beats.
#
mfcc = librosa.feature.mfcc(y=y, sr=sr)
Msync = librosa.util.sync(mfcc, beats)
path_distance = np.sum(np.diff(Msync, axis=1)**2, axis=0)
sigma = np.median(path_distance)
path_sim = np.exp(-path_distance / sigma)
R_path = np.diag(path_sim, k=1) + np.diag(path_sim, k=-1)
##########################################################
# And compute the balanced combination (Equations 6, 7, 9)
deg_path = np.sum(R_path, axis=1)
deg_rec = np.sum(Rf, axis=1)
mu = deg_path.dot(deg_path + deg_rec) / np.sum((deg_path + deg_rec)**2)
A = mu * Rf + (1 - mu) * R_path
###########################################################
# Plot the resulting graphs (Figure 1, left and center)
fig, ax = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(10, 4))
librosa.display.specshow(Rf, cmap='inferno_r', y_axis='time', x_axis='s',
y_coords=beat_times, x_coords=beat_times, ax=ax[0])
ax[0].set(title='Recurrence similarity')
ax[0].label_outer()
librosa.display.specshow(R_path, cmap='inferno_r', y_axis='time', x_axis='s',
y_coords=beat_times, x_coords=beat_times, ax=ax[1])
ax[1].set(title='Path similarity')
ax[1].label_outer()
librosa.display.specshow(A, cmap='inferno_r', y_axis='time', x_axis='s',
y_coords=beat_times, x_coords=beat_times, ax=ax[2])
ax[2].set(title='Combined graph')
ax[2].label_outer()
#####################################################
# Now let's compute the normalized Laplacian (Eq. 10)
L = scipy.sparse.csgraph.laplacian(A, normed=True)
# and its spectral decomposition
evals, evecs = scipy.linalg.eigh(L)
# We can clean this up further with a median filter.
# This can help smooth over small discontinuities
evecs = scipy.ndimage.median_filter(evecs, size=(9, 1))
# cumulative normalization is needed for symmetric normalized Laplacian eigenvectors
Cnorm = np.cumsum(evecs**2, axis=1)**0.5
# If we want k clusters, use the first k normalized eigenvectors.
# Fun exercise: see how the segmentation changes as you vary k
k = 5
X = evecs[:, :k] / Cnorm[:, k-1:k]
# Plot the resulting representation (Figure 1, center and right)
fig, ax = plt.subplots(ncols=2, sharey=True, figsize=(10, 5))
librosa.display.specshow(Rf, cmap='inferno_r', y_axis='time', x_axis='time',
y_coords=beat_times, x_coords=beat_times, ax=ax[1])
ax[1].set(title='Recurrence similarity')
ax[1].label_outer()
librosa.display.specshow(X,
y_axis='time',
y_coords=beat_times, ax=ax[0])
ax[0].set(title='Structure components')
#############################################################
# Let's use these k components to cluster beats into segments
# (Algorithm 1)
KM = sklearn.cluster.KMeans(n_clusters=k)
seg_ids = KM.fit_predict(X)
# and plot the results
fig, ax = plt.subplots(ncols=3, sharey=True, figsize=(10, 4))
colors = plt.get_cmap('Paired', k)
librosa.display.specshow(Rf, cmap='inferno_r', y_axis='time',
y_coords=beat_times, ax=ax[1])
ax[1].set(title='Recurrence matrix')
ax[1].label_outer()
librosa.display.specshow(X,
y_axis='time',
y_coords=beat_times, ax=ax[0])
ax[0].set(title='Structure components')
img = librosa.display.specshow(np.atleast_2d(seg_ids).T, cmap=colors,
y_axis='time', y_coords=beat_times, ax=ax[2])
ax[2].set(title='Estimated segments')
ax[2].label_outer()
fig.colorbar(img, ax=[ax[2]], ticks=range(k))
###############################################################
# Locate segment boundaries from the label sequence
bound_beats = 1 + np.flatnonzero(seg_ids[:-1] != seg_ids[1:])
# Count beat 0 as a boundary
bound_beats = librosa.util.fix_frames(bound_beats, x_min=0)
# Compute the segment label for each boundary
bound_segs = list(seg_ids[bound_beats])
# Convert beat indices to frames
bound_frames = beats[bound_beats]
# Make sure we cover to the end of the track
bound_frames = librosa.util.fix_frames(bound_frames,
x_min=None,
x_max=C.shape[1]-1)
###################################################
# And plot the final segmentation over original CQT
# sphinx_gallery_thumbnail_number = 5
bound_times = librosa.frames_to_time(bound_frames)
freqs = librosa.cqt_frequencies(n_bins=C.shape[0],
fmin=librosa.note_to_hz('C1'),
bins_per_octave=BINS_PER_OCTAVE)
fig, ax = plt.subplots()
librosa.display.specshow(C, y_axis='cqt_hz', sr=sr,
bins_per_octave=BINS_PER_OCTAVE,
x_axis='time', ax=ax)
for interval, label in zip(zip(bound_times, bound_times[1:]), bound_segs):
ax.add_patch(patches.Rectangle((interval[0], freqs[0]),
interval[1] - interval[0],
freqs[-1],
facecolor=colors(label),
alpha=0.50))
I think the main thing that has to change is the number of clusters; in the example they use 5, but I don't know what I want it to be, because I don't know how many sounds there are. Setting it to 400 produced the following result, which is not really something I can use. Ideally I would like all of the blocks to be solid colors rather than colors in between the maximum red and blue values.
(I rotated it sideways so it looks more like my example above and closer to the output I'm trying to produce.)
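One way to choose k automatically would be the eigengap heuristic: take the largest jump in the sorted Laplacian eigenvalues as the number of clusters, then plot the per-beat labels with a discrete colormap so that each block is a solid color. This is a rough sketch, not part of the paper's method; it assumes evals, evecs, Cnorm, beat_times and the imports from the code above are in scope, and max_k is an arbitrary cap:

```python
# Assumption: evals, evecs, Cnorm, beat_times, np, plt, sklearn.cluster and librosa.display
# are all available from running the Laplacian segmentation code above.
max_k = 64                                  # arbitrary upper bound on the number of clusters
gaps = np.diff(evals[:max_k])               # eigenvalue gaps lambda_{i+1} - lambda_i
k = int(np.argmax(gaps)) + 1                # eigengap heuristic: largest gap -> k

X = evecs[:, :k] / Cnorm[:, k - 1:k]
seg_ids = sklearn.cluster.KMeans(n_clusters=k).fit_predict(X)

# A discrete colormap with exactly k entries keeps every block a solid color.
fig, ax = plt.subplots(figsize=(2, 6))
librosa.display.specshow(np.atleast_2d(seg_ids).T,
                         cmap=plt.get_cmap('tab20', k),
                         y_axis='time', y_coords=beat_times, ax=ax)
ax.set(title=f'Beat clusters (k = {k})')
```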
Additional information
There may also be a drum track in the background, and sometimes multiple sounds play at the same time. If these groups of simultaneous sounds are interpreted as a single unique sound, that's fine, but I would obviously prefer it if they could be distinguished as separate sounds.
If it makes things easier, you can remove the drum loop with
y, sr = librosa.load('exampleSong.mp3')
y_harmonic, y_percussive = librosa.effects.hpss(y)
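The CQT and MFCC features in the code above would then be computed from y_harmonic instead of y.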
Update
I was able to separate the sounds by transients. This sort of works, but it splits the clip into too many sounds; from what I can tell, it mostly just splits some single sounds into two. I can also create a MIDI file from the software I'm using and use it to determine the transient times, but I would like to solve this without the MIDI file if I can. The MIDI file is very accurate and splits the sound file into 33 sections, whereas the transient code splits it into 40 sections. Here is a visualization of the MIDI.
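For reference, the onset/transient splitting is roughly along the lines of this sketch (not my exact code; 'mySong.mp3' is a placeholder and the detector parameters would still need tuning):

```python
import numpy as np
import librosa

# 'mySong.mp3' is a placeholder for the actual clip.
y, sr = librosa.load('mySong.mp3')

# Detect onset (transient) frames; backtrack=True moves each onset back to the
# preceding energy minimum, which gives cleaner cut points for slicing.
onset_frames = librosa.onset.onset_detect(y=y, sr=sr, backtrack=True)
onset_samples = librosa.frames_to_samples(onset_frames)

# Slice the waveform at the detected transients.
boundaries = np.concatenate(([0], onset_samples, [len(y)]))
slices = [y[start:end] for start, end in zip(boundaries[:-1], boundaries[1:]) if end > start]
print(f'split into {len(slices)} slices')
```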
So the parts that still need to be solved are:
- better transient separation
- sorting the sounds (see the sketch after this list)
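For the sorting, one idea would be to summarize each slice with MFCCs and use hierarchical clustering to get a row ordering in which similar slices end up next to each other, merging slices that are effectively identical by cutting the tree. A rough sketch, assuming slices and sr from the transient sketch above:

```python
import numpy as np
import librosa
import scipy.cluster.hierarchy as hierarchy

# Assumption: `slices` (list of waveforms) and `sr` come from the transient-splitting sketch above.
features = np.array([np.mean(librosa.feature.mfcc(y=s, sr=sr, n_mfcc=20), axis=1)
                     for s in slices])

# Average-linkage clustering on cosine distance between the per-slice MFCC summaries.
linkage = hierarchy.linkage(features, method='average', metric='cosine')

# Leaf ordering puts similar slices next to each other -> use it to order the grid rows.
row_order = hierarchy.leaves_list(
    hierarchy.optimal_leaf_ordering(linkage, features, metric='cosine'))

# Slices so alike that they should count as the same sound can be merged by cutting the tree;
# the 0.1 threshold is arbitrary and would need tuning by ear.
sound_ids = hierarchy.fcluster(linkage, t=0.1, criterion='distance')
```

row_order would then give the vertical position of each slice's row in the grid, and sound_ids would decide which slices count as the same sound.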