我们有一个 UWP 应用程序,它使用 Microsoft 语音来朗读文本并突出显示正在朗读的内容。我注意到应用程序的内存使用量随着每朗读一段文字而增加,最终会耗尽内存。无论使用哪种语音、朗读什么文本,都会出现这个问题。

为了突出显示文本,我订阅了 MediaPlaybackItem 的 TimedMetadataTracks 中各轨道的事件。文本朗读完毕后,我取消订阅每个事件并释放(Dispose)MediaPlaybackItem.Source。Visual Studio 内存分析器未显示托管内存中存在任何泄漏,因此我怀疑是非托管内存中有某些内容没有被清理。

编辑:我在代码注释中提到过这一点,但在这里再说明一次——如果我不订阅 TimedMetadataTrack 事件,泄漏就会消失。我还可以使用 Windows 示例应用程序(Synthesize Text with Boundaries)重现这个问题。

我是否遗漏了某些需要释放(Dispose)的东西,还是说这是 SpeechSynthesizer/MediaPlayer 中的错误?

using System;
using System.Diagnostics;
using Windows.Media.Core;
using Windows.Media.Playback;
using Windows.Media.SpeechSynthesis;

namespace WindowsTts
    public class UwpNativeVoice : IDisposable
        // Lock object taken around most reads and writes of _activeSpeech.
        private readonly object _activeSpeechLock;
        // Synthesizer that Speak() uses to turn text into an audio stream.
        private SpeechSynthesizer _synthesizer;
        // Player that the synthesized stream is handed to; this class handles its
        // MediaOpened and MediaEnded events.
        private MediaPlayer _mediaPlayer;
        // Callback wrapper for the utterance currently in flight; null when nothing is active.
        private SpeechCallback _activeSpeech;

        /// <summary>
        /// Builds the synthesizer and the media player used by this voice and hooks
        /// the player's open/end notifications.
        /// </summary>
        /// <param name="platformInfo">Platform voice assigned to the synthesizer.</param>
        public UwpNativeVoice(VoiceInformation platformInfo)
        {
            _activeSpeechLock = new object();

            // Word-boundary metadata is requested so the synthesized stream carries
            // the cues that OnTimedMetadataTrackEntered consumes.
            var synthesizer = new SpeechSynthesizer();
            synthesizer.Options.IncludeWordBoundaryMetadata = true;
            synthesizer.Voice = platformInfo;
            _synthesizer = synthesizer;

            // AutoPlay is off, so playback has to be started explicitly.
            var player = new MediaPlayer();
            player.RealTimePlayback = true;
            player.AutoPlay = false;
            player.Volume = 1.0f;
            player.MediaOpened += OnMediaPlayerMediaOpened;
            player.MediaEnded += OnMediaPlayerMediaEnded;
            _mediaPlayer = player;
        }

        /// <summary>
        /// Releases the media player, any media source still loaded into it, and the
        /// speech synthesizer. Safe to call more than once.
        /// </summary>
        public void Dispose()
        {
            var player = _mediaPlayer;
            if (player != null)
            {
                // Clear the field first so a repeated Dispose() is a no-op instead of
                // dereferencing a null player.
                _mediaPlayer = null;

                player.MediaOpened -= OnMediaPlayerMediaOpened;
                player.MediaEnded -= OnMediaPlayerMediaEnded;

                if (player.Source is MediaPlaybackItem item)
                {
                    // Detach the cue handlers before releasing the source so the item
                    // no longer references this instance.
                    DestroyMediaPlaybackItem(item);
                    item.Source?.Dispose();
                }

                player.Source = null;

                // MediaPlayer is closable; dropping the reference alone leaves its
                // native resources alive until finalization.
                player.Dispose();
            }

            // SpeechSynthesizer is closable as well.
            _synthesizer?.Dispose();
            _synthesizer = null;
        }

        /// <summary>
        /// Synthesizes <paramref name="text"/> and hands the resulting stream to the
        /// media player, registering <paramref name="speechDelegate"/> as the
        /// receiver of progress events for this utterance.
        /// </summary>
        /// <param name="text">Text to speak. Null or empty text only produces a Start/End event pair.</param>
        /// <param name="speechDelegate">Optional receiver of progress events; may be null.</param>
        /// <remarks>
        /// This method is <c>async void</c>, so callers cannot observe failures.
        /// Synthesis and playback-setup errors are therefore written to the debug
        /// output instead of being allowed to escape.
        /// </remarks>
        public async void Speak(string text, SpeechDelegate speechDelegate)
        {
            if (string.IsNullOrEmpty(text))
            {
                // no-op; just fire events and bail
                speechDelegate?.Invoke(text, ReadTextEvent.Start);
                speechDelegate?.Invoke(text, ReadTextEvent.End);
                return;
            }

            if (_activeSpeech != null)
            {
                // something currently speaking; halt it, fire events and then start anew
                Halt();
            }

            // get synth stream, and add markers for bookmarks & word boundaries
            SpeechSynthesisStream synthStream;
            try
            {
                synthStream = await _synthesizer.SynthesizeTextToStreamAsync(text);
            }
            catch (Exception e)
            {
                // An exception escaping an async void method cannot be caught by the caller.
                Debug.WriteLine("Speech synthesis failed: " + e);
                return;
            }

            lock (_activeSpeechLock)
            {
                _activeSpeech = new SpeechCallback(text, speechDelegate);

                try
                {
                    var source = MediaSource.CreateFromStream(synthStream, synthStream.ContentType);
                    var playbackItem = new MediaPlaybackItem(source);
                    ConfigPlaybackEvents(playbackItem); //Comment this out and the leak goes away
                    _mediaPlayer.Source = playbackItem;
                }
                catch (Exception e)
                {
                    Debug.WriteLine("Failed to start speech playback: " + e);
                    _activeSpeech = null;

                    // Nothing owns the stream once setup has failed; release it here.
                    synthStream.Dispose();
                }
            }
        }

        /// <summary>
        /// Stops the utterance currently in flight, if any, and reports
        /// <see cref="ReadTextEvent.End"/> to its delegate.
        /// </summary>
        /// <returns>Always <c>true</c>.</returns>
        public bool Halt()
        {
            SpeechCallback callback;
            lock (_activeSpeechLock)
            {
                callback = _activeSpeech;
                if (callback == null)
                {
                    return true;
                }

                // Claim the active speech inside the same critical section as the
                // check, so a concurrent MediaEnded cannot also claim it and report
                // End a second time.
                _activeSpeech = null;
            }

            DestroyMediaPlaybackItem(_mediaPlayer.Source as MediaPlaybackItem);
            _mediaPlayer.Source = null;

            // Invoke outside the lock so the delegate can call back into this class.
            callback.Invoke(ReadTextEvent.End);

            return true;
        }

        /// <summary>
        /// Handles the player's MediaOpened notification: reports the start of the
        /// utterance and begins playback.
        /// </summary>
        /// <param name="sender">Player that opened the media.</param>
        /// <param name="args">Unused.</param>
        private void OnMediaPlayerMediaOpened(MediaPlayer sender, object args)
        {
            FireReadTextEvent(ReadTextEvent.Start);

            // The player is created with AutoPlay = false, so playback has to be
            // started explicitly once the media is ready.
            sender.Play();
        }

        /// <summary>
        /// Translates a speech cue into a word event carrying the offset and length
        /// of the word within the input text.
        /// </summary>
        /// <param name="track">Track that raised the cue; only speech tracks are handled.</param>
        /// <param name="args">Cue arguments; only <see cref="SpeechCue"/> cues are handled.</param>
        private void OnTimedMetadataTrackEntered(TimedMetadataTrack track, MediaCueEventArgs args)
        {
            if (track.TimedMetadataKind != TimedMetadataKind.Speech || !(args.Cue is SpeechCue speechCue))
            {
                return;
            }

            var startIdx = speechCue.StartPositionInInput ?? 0;
            var endIdx = speechCue.EndPositionInInput ?? -1;

            // Both positions are nullable. With a missing end position the raw
            // computation yields a negative length, so clamp it to zero.
            var length = Math.Max(0, (endIdx - startIdx) + 1);

            FireReadTextEvent(ReadTextEvent.WordEvent(startIdx, length));
        }

        /// <summary>
        /// Handles the player's MediaEnded notification: clears the active
        /// utterance, tears down the finished playback item and reports
        /// <see cref="ReadTextEvent.End"/> to the utterance's delegate.
        /// </summary>
        /// <param name="sender">Player that finished playing.</param>
        /// <param name="args">Unused.</param>
        private void OnMediaPlayerMediaEnded(MediaPlayer sender, object args)
        {
            SpeechCallback callback;
            lock (_activeSpeechLock)
            {
                callback = _activeSpeech;
                _activeSpeech = null;
            }

            DestroyMediaPlaybackItem(sender.Source as MediaPlaybackItem);
            sender.Source = null;

            // callback is null when Halt() already claimed this utterance.
            // Invoke outside the lock so the delegate can call back into this class.
            callback?.Invoke(ReadTextEvent.End);
        }

        /// <summary>
        /// Forwards <paramref name="evt"/> to the delegate of the utterance that is
        /// currently active; does nothing when no utterance is active.
        /// </summary>
        /// <param name="evt">Event to deliver.</param>
        private void FireReadTextEvent(ReadTextEvent evt)
        {
            SpeechCallback callback;
            lock (_activeSpeechLock)
            {
                callback = _activeSpeech;
            }

            // Take the snapshot under the lock but invoke outside it, so the delegate
            // can call back into this class.
            callback?.Invoke(evt);
        }

        /// <summary>
        /// Runs <see cref="RegisterAction"/> for every timed-metadata track that
        /// <paramref name="playbackItem"/> currently exposes.
        /// </summary>
        /// <param name="playbackItem">Playback item whose tracks are inspected.</param>
        private void ConfigPlaybackEvents(MediaPlaybackItem playbackItem)
        {
            // see: https://docs.microsoft.com/en-us/uwp/api/windows.media.core.timedmetadatatrack

            // Walk the tracks by index; RegisterAction needs the index for the
            // presentation-mode call.
            int trackIndex = 0;
            while (trackIndex < playbackItem.TimedMetadataTracks.Count)
            {
                RegisterAction(playbackItem, trackIndex);
                trackIndex++;
            }
        }

        /// <summary>
        /// Subscribes to cue notifications on the track at <paramref name="idx"/>
        /// when that track is identified, by Id or Label, as the word-boundary track,
        /// and marks the track as application-presented.
        /// </summary>
        /// <param name="item">Playback item that owns the track.</param>
        /// <param name="idx">Index of the track within the item's timed-metadata tracks.</param>
        private void RegisterAction(MediaPlaybackItem item, int idx)
        {
            const string wordTrackName = "SpeechWord";

            TimedMetadataTrack candidate = item.TimedMetadataTracks[idx];

            bool isWordTrack =
                candidate.Id.Equals(wordTrackName, StringComparison.Ordinal)
                || candidate.Label.Equals(wordTrackName, StringComparison.Ordinal);
            if (!isWordTrack)
            {
                return;
            }

            candidate.CueEntered += OnTimedMetadataTrackEntered;
            item.TimedMetadataTracks.SetPresentationMode(
                (uint)idx,
                TimedMetadataTrackPresentationMode.ApplicationPresented);
        }

        /// <summary>
        /// Detaches this instance's cue handler from every timed-metadata track of
        /// <paramref name="item"/> and disposes the item's media source.
        /// </summary>
        /// <param name="item">Playback item to tear down; null is ignored.</param>
        private void DestroyMediaPlaybackItem(MediaPlaybackItem item)
        {
            if (item == null)
            {
                return;
            }

            foreach (var track in item.TimedMetadataTracks)
            {
                track.CueEntered -= OnTimedMetadataTrackEntered;
            }

            // Release the media source explicitly; otherwise it stays alive until
            // finalization.
            item.Source?.Dispose();
        }


namespace WindowsTts
    /// <summary>Defines a trigger that caused the broadcasting of a ReadTextEvent.</summary>
    /// <remarks>
    /// NOTE(review): the member list is not visible in this listing. Code elsewhere
    /// in this file references the members Start, End, Bookmark and Word; confirm
    /// the full list and its order against the original source.
    /// </remarks>
    public enum ReadTextTrigger

    /// <summary>
    /// Carries the progress information of a text-to-speech operation (what
    /// triggered the event, plus an optional bookmark name or text range) and is
    /// handed to the API user through a <see cref="SpeechDelegate"/>.
    /// </summary>
    /// <remarks>
    /// Instances are obtained from the static members only; the constructor is
    /// private. <see cref="Start"/> and <see cref="End"/> each return one shared
    /// instance, and the properties have public setters, so mutating those shared
    /// instances affects every holder.
    /// </remarks>
    public class ReadTextEvent
    {
        private ReadTextEvent()
        {
        }

        /// <summary>What caused this event to be raised.</summary>
        public ReadTextTrigger Trigger { get; set; }

        /// <summary>Bookmark name for bookmark events; null for the other factory-created events.</summary>
        public string BookmarkName { get; set; }

        /// <summary>Offset of the word within the spoken text; -1 when not applicable.</summary>
        public int TextOffset { get; set; }

        /// <summary>Length of the word within the spoken text; -1 when not applicable.</summary>
        public int TextLength { get; set; }

        /// <summary>Shared event instance with the Start trigger.</summary>
        public static ReadTextEvent Start { get; } = Create(ReadTextTrigger.Start, null, -1, -1);

        /// <summary>Shared event instance with the End trigger.</summary>
        public static ReadTextEvent End { get; } = Create(ReadTextTrigger.End, null, -1, -1);

        /// <summary>Creates a new event that copies every field of <paramref name="src"/>.</summary>
        /// <param name="src">Event to copy.</param>
        /// <returns>A new, independent instance.</returns>
        public static ReadTextEvent Factory(ReadTextEvent src)
            => Create(src.Trigger, src.BookmarkName, src.TextOffset, src.TextLength);

        /// <summary>Creates a bookmark event for the given bookmark name.</summary>
        /// <param name="bookmark">Name of the bookmark.</param>
        /// <returns>A new event with the Bookmark trigger and no text range.</returns>
        public static ReadTextEvent BookmarkEvent(string bookmark)
            => Create(ReadTextTrigger.Bookmark, bookmark, -1, -1);

        /// <summary>Creates a word event for the given text range.</summary>
        /// <param name="textOffset">Offset of the word within the spoken text.</param>
        /// <param name="textLength">Length of the word.</param>
        /// <returns>A new event with the Word trigger and no bookmark name.</returns>
        public static ReadTextEvent WordEvent(int textOffset, int textLength)
            => Create(ReadTextTrigger.Word, null, textOffset, textLength);

        // Single construction path shared by all public factories.
        private static ReadTextEvent Create(ReadTextTrigger trigger, string bookmarkName, int textOffset, int textLength)
        {
            var created = new ReadTextEvent();
            created.Trigger = trigger;
            created.BookmarkName = bookmarkName;
            created.TextOffset = textOffset;
            created.TextLength = textLength;
            return created;
        }
    }

    /// <summary>
    /// A SpeechDelegate is passed to the ITtsVoice.Speak() method, so that the caller may receive progress info as the text is being spoken.
    /// </summary>
    /// <param name="speechText">The text the event refers to; in this file it is the text that was passed to Speak().</param>
    /// <param name="readTextEvent">The progress event being reported (start, end, bookmark or word).</param>
    public delegate void SpeechDelegate(string speechText, ReadTextEvent readTextEvent);

    /// <summary>
    /// Pairs a piece of text with the <see cref="SpeechDelegate"/> that should be
    /// told about its progress. One instance may be created per enqueued string
    /// and then invoked repeatedly, each time with a different ReadTextEvent.
    /// </summary>
    public class SpeechCallback
    {
        private readonly SpeechDelegate _speechDelegate;

        /// <summary>Creates a callback for the given text and delegate.</summary>
        /// <param name="text">Text that is passed to the delegate on every invocation.</param>
        /// <param name="speechDelegate">Delegate to notify; may be null, in which case Invoke does nothing.</param>
        public SpeechCallback(string text, SpeechDelegate speechDelegate)
        {
            _speechDelegate = speechDelegate;
            Text = text;
        }

        /// <summary>The text this callback was created for.</summary>
        public string Text { get; }

        /// <summary>Passes <see cref="Text"/> and the given event to the delegate, if one was supplied.</summary>
        /// <param name="readTextEvent">Event to report.</param>
        public void Invoke(ReadTextEvent readTextEvent)
        {
            if (_speechDelegate != null)
            {
                _speechDelegate.Invoke(Text, readTextEvent);
            }
        }
    }

