0

我正在尝试使用 UWP 的 AudioGraph API 来重现合成语音和简短通知声音(“earcons”)的混合。

UWP 有一个语音合成 API,它为我提供了一个包含 WAV 文件的流,但我不想对参数(比特率、样本深度等)做出太多假设,所以我的想法是拥有一个AudioSubmixNode并添加AudioFrameInputNodes每当有一些演讲要重现。将单独的话语排队以使它们不重叠存在一些复杂性。

该图被初始化为

    private async Task InitAudioGraph()
    {
        var graphCreated = await AudioGraph.CreateAsync(new AudioGraphSettings(Windows.Media.Render.AudioRenderCategory.Speech)
        {
            QuantumSizeSelectionMode = QuantumSizeSelectionMode.LowestLatency
        });
        if (graphCreated.Status != AudioGraphCreationStatus.Success) return;

        _Graph = graphCreated.Graph;
        var outputCreated = await _Graph.CreateDeviceOutputNodeAsync();
        if (outputCreated.Status != AudioDeviceNodeCreationStatus.Success) return;

        _Mixer = _Graph.CreateSubmixNode();
        _Mixer.AddOutgoingConnection(outputCreated.DeviceOutputNode);

        _Graph.Start();
    }

然后播放当前的话语

class SpeechStreamPlayer : IDisposable
{
    internal static void Play(AudioGraph graph, AudioSubmixNode mixer, SpeechSynthesisStream speechStream)
    {
        if (!speechStream.ContentType.Equals("audio/wav", StringComparison.OrdinalIgnoreCase)) throw new NotSupportedException("Content type: " + speechStream.ContentType);

        var stream = speechStream.AsStreamForRead();

        // Read the RIFF header
        uint chunkId = stream.ReadUint(); // "RIFF" - but in little-endian
        if (chunkId != 0x46464952) throw new NotSupportedException("Magic: " + chunkId);
        uint chunkSize = stream.ReadUint(); // Length of rest of stream
        uint format = stream.ReadUint(); // "WAVE"
        if (format != 0x45564157) throw new NotSupportedException("Stream format: " + format);

        // "fmt " sub-chunk
        uint subchunkId = stream.ReadUint();
        if (subchunkId != 0x20746d66) throw new NotSupportedException("Expected fmt sub-chunk, found " + subchunkId);
        uint subchunkSize = stream.ReadUint();
        uint subchunk2Off = (uint)stream.Position + subchunkSize;
        uint audioFormat = (uint)stream.ReadShort();
        uint chans = (uint)stream.ReadShort();
        uint sampleRate = stream.ReadUint();
        uint byteRate = stream.ReadUint();
        uint blockSize = (uint)stream.ReadShort();
        uint bitsPerSample = (uint)stream.ReadShort();

        // Possibly extra stuff added, so...
        stream.Seek(subchunk2Off, SeekOrigin.Begin);

        subchunkId = stream.ReadUint(); // "data"
        if (subchunkId != 0x61746164) throw new NotSupportedException("Expected data sub-chunk, found " + subchunkId);
        subchunkSize = stream.ReadUint();

        // Ok, the stream is in the correct place to start extracting data and we have the parameters.
        var props = AudioEncodingProperties.CreatePcm(sampleRate, chans, bitsPerSample);

        var frameInputNode = graph.CreateFrameInputNode(props);
        frameInputNode.AddOutgoingConnection(mixer);

        new SpeechStreamPlayer(frameInputNode, mixer, stream, blockSize);
    }

    internal event EventHandler StreamFinished;

    private SpeechStreamPlayer(AudioFrameInputNode frameInputNode, AudioSubmixNode mixer, Stream stream, uint sampleSize)
    {
        _FrameInputNode = frameInputNode;
        _Mixer = mixer;
        _Stream = stream;
        _SampleSize = sampleSize;

        _FrameInputNode.QuantumStarted += Source_QuantumStarted;
        _FrameInputNode.Start();
    }

    private AudioFrameInputNode _FrameInputNode;
    private AudioSubmixNode _Mixer;
    private Stream _Stream;
    private readonly uint _SampleSize;

    private unsafe void Source_QuantumStarted(AudioFrameInputNode sender, FrameInputNodeQuantumStartedEventArgs args)
    {
        if (args.RequiredSamples <= 0) return;
        System.Diagnostics.Debug.WriteLine("Requested {0} samples", args.RequiredSamples);

        var frame = new AudioFrame((uint)args.RequiredSamples * _SampleSize);
        using (var buffer = frame.LockBuffer(AudioBufferAccessMode.Write))
        {
            using (var reference = buffer.CreateReference())
            {
                byte* pBuffer;
                uint capacityBytes;

                var directBuffer = reference as IMemoryBufferByteAccess;
                ((IMemoryBufferByteAccess)reference).GetBuffer(out pBuffer, out capacityBytes);

                uint bytesRemaining = (uint)_Stream.Length - (uint)_Stream.Position;
                uint bytesToCopy = Math.Min(capacityBytes, bytesRemaining);

                for (uint i = 0; i < bytesToCopy; i++) pBuffer[i] = (byte)_Stream.ReadByte();
                for (uint i = bytesToCopy; i < capacityBytes; i++) pBuffer[i] = 0;

                if (bytesRemaining <= capacityBytes)
                {
                    Dispose();
                    StreamFinished?.Invoke(this, EventArgs.Empty);
                }
            }
        }

        sender.AddFrame(frame);
    }

    public void Dispose()
    {
        if (_FrameInputNode != null)
        {
            _FrameInputNode.QuantumStarted -= Source_QuantumStarted;
            _FrameInputNode.Dispose();
            _FrameInputNode = null;
        }

        if (_Stream != null)
        {
            _Stream.Dispose();
            _Stream = null;
        }
    }
}

这工作一次。当第一个话语结束时,StreamFinished?.Invoke(this, EventArgs.Empty);通知队列管理系统应该播放下一个话语,并且线路

    var frameInputNode = graph.CreateFrameInputNode(props);

抛出一个Exceptionwith 消息Exception from HRESULT: 0x88960001。一点挖掘表明它对应于 XAUDIO2_E_INVALID_CALL,但这不是很有描述性。

在这两种情况下,传递给的参数AudioEncodingProperties.CreatePcm都是(22050, 1, 16).

我怎样才能找到有关出了什么问题的更多详细信息?在最坏的情况下,我想我可以把整个图表扔掉,每次都建立一个新的,但这似乎效率很低。

4

1 回答 1

0

问题似乎在

当第一个话语结束时,StreamFinished?.Invoke(this, EventArgs.Empty);通知队列管理系统应该播放下一个话语

尽管AudioFrameInputNode.QuantumStarted的文档没有说明禁止的操作,但AudioGraph.QuantumStarted的文档说

QuantumStarted 事件是同步的,这意味着您无法在此事件的处理程序中更新 AudioGraph 或单个音频节点的属性或状态。尝试执行诸如停止音频图或添加、删除或启动单个音频节点之类的操作将导致抛出异常。

看来这也适用于节点的QuantumStarted事件。

简单的解决方案是将图形操作移动到另一个线程

                        Task.Run(() => StreamFinished?.Invoke(this, EventArgs.Empty));
于 2017-02-09T10:57:02.363 回答