1

我想要一种有效的算法来查找更大序列中所有出现的模式。

例如,给定以下输入:
模式: GAS
序列: ASDFGASDFGASDFADFASDFGA

预期输出: {4, 9}

根据接受的类似问题的答案,实现了实现所需任务的算法。但是,一条评论报告该算法“在大字节数组上很慢”。

在阅读之后,看起来最好的算法是Boyer-Moore 字符串搜索算法,它在CodeProject上的 C# 中实现,但我在为通用枚举实现它时遇到了麻烦。

是否有任何基于 Boyer-Moore 算法的现有解决方案来查找.NET中通用序列中模式的所有出现?

注意
虽然我在示例中使用了字符串,但我想要一个适用于任何实现 IEnumerable 的数据的答案。换句话说,它不仅应该适用于字符串,而且应该适用于任何类型。

4

2 回答 2

3

当序列是模式的重复并且模式是另一个模式重复 m 次(如果我错了,请纠正我)时,最坏情况的性能是 O(nm)(其中 n = seq.Count)。

List<int> LookFor<T>( IEnumerable<T> seq, T[ ] pattern )
        where T : IEquatable<T> {

    var partialMatches = new LinkedList<int>( );
    var matches = new List<int>( );

    int i = 0;
    foreach ( T item in seq ) {
        if ( item.Equals( pattern[ 0 ] ) )
            partialMatches.AddLast( 0 );

        var n = partialMatches.First;
        while(n != null) {
            if ( item.Equals( pattern[ n.Value ] ) ) {
                n.Value += 1;
                if ( n.Value == pattern.Length ) {
                    matches.Add( i - pattern.Length + 1 );

                    var next = n.Next;
                    partialMatches.Remove( n );
                    n = next;

                    continue;
                }
            }
            else partialMatches.Remove( n );

            n = n.Next;
        }

        i += 1;
    }

    return matches;
}

测试:

void Main()
{
    var matches = LookFor( "abcabcabcabcabcabcabc", 
        new char[ ] { 'a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c' } );

    foreach ( var x in matches )
        Console.WriteLine( "{0}", x );
}

输出:

0
3
6
9
12
于 2012-09-26T14:43:14.233 回答
0

在徒劳地理解 Boyer-Moore 算法之后,我将这段代码放在一起,它通过一次遍历更大的集合来进行模式匹配。

我无法针对 Boyer-Moore 算法对其进行测试,但它的工作效率非常高,当整个序列是模式的重复时, O(nm)是最坏情况下的性能。

这是我的实现。让我知道你对此的看法。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace ConsoleApplication1
{
    class Program
    {
        static void Main(string[] args)
        {
            Console.WriteLine("Enter the string you want to search within.");
            string hayStack = Console.ReadLine();
            Console.WriteLine("Enter the string you want to search for.");
            string needle = Console.ReadLine();

            var ps = new PatternSearch<char>(needle.ToCharArray());

            Console.WriteLine();
            Console.WriteLine();

            Console.WriteLine(hayStack);

            var matches = ps.Matches(hayStack.ToCharArray()).ToList();

            for (int i = 0; i < hayStack.Length; i++)
                Console.Write(matches.Contains(i) ? "↑" : " ");

            Console.WriteLine();

            Console.ReadLine();
        }
    }

    /// <summary>Implements a pattern searching algorithm with <b>O(nm)</b> worst-case performance.</summary>
    /// <typeparam name="T">The data type of the array to search.</typeparam>
    public class PatternSearch<T>
    {
        private struct MatchInfo
        {
            public MatchInfo(int startIndex, int matchLength)
            {
                this.StartIndex = startIndex;
                this.MatchLength = matchLength;
            }
            public int StartIndex;
            public int MatchLength;
        }

        private IEnumerable<T> pattern;
        private List<MatchInfo> found;
        private Func<T, T, bool> eqComp;

        //optimization for IEnumerables that do not implement IList
        int patLen = -1;
        int seqLen = -1;

        /// <summary>Initializes a new instance of the <see cref="PatternSearch{T}" /> class.</summary>
        /// <param name="pattern">The pattern that will be searched for.</param>
        public PatternSearch(T[] pattern) : this(pattern, (x, y) => x.Equals(y)) { }

        /// <summary>
        /// Initializes a new instance of the <see cref="PatternSearch{T}"/> class with the specified equality comparer.
        /// </summary>
        /// <param name="pattern">The pattern to be searched for.</param>
        /// <param name="equalityComparer">The equality comparer to use for matching elements in the array.</param>
        public PatternSearch(T[] pattern, Func<T, T, bool> equalityComparer)
        {
            patLen = pattern.Length;

            if (pattern == null)
                throw new ArgumentNullException("pattern", "The search pattern cannot be null.");
            if (equalityComparer == null)
                throw new ArgumentNullException("equalityComparer", "The equality comparer cannot be null.");

            if (patLen <= 0)
                throw new ArgumentException("pattern", "The pattern cannot be empty.");

            // assign the values
            this.pattern = pattern;
            found = new List<MatchInfo>();
            eqComp = equalityComparer;
        }

        /// <summary>
        /// Returns the start index of all occurrences of the search pattern within the specified array.
        /// </summary>
        /// <param name="seq">The larger sequence to find occurrences of the search pattern within.</param>
        public IEnumerable<int> Matches(IEnumerable<T> seq)
        {
            seqLen = seqLen == -1 ? seq.Count() : seqLen;
            return this.Matches(seq, 0, seqLen);
        }

        /// <summary>
        /// Returns the start index of all occurrences of the search pattern within the specified array.
        /// </summary>
        /// <param name="seq">The larger sequence to find occurrences of the search pattern within.</param>
        /// <param name="startIndex">The index in <paramref name="seq"/> to start searching at.</param>
        public IEnumerable<int> Matches(IEnumerable<T> seq, int startIndex)
        {
            seqLen = seqLen == -1 ? seq.Count() : seqLen;
            return this.Matches(seq, startIndex, seqLen);
        }

        /// <summary>
        /// Returns the start index of all occurrences of the search pattern within the specified array.
        /// </summary>
        /// <param name="seq">The larger sequence to find occurrences of the search pattern within.</param>
        /// <param name="count">The maximum number of items in <paramref name="seq"/> to match.</param>
        public IEnumerable<int> Matches(IEnumerable<T> seq, int startIndex, int count)
        {
            patLen = patLen == -1 ? pattern.Count() : patLen;
            seqLen = seqLen == -1 ? seq.Count() : seqLen;
            bool addedNew = false;

            var endPoint = Math.Min(seqLen, startIndex + count);

            if (seq == null ||                      // sequence cannot be null
                seqLen < patLen ||                  // pattern cannot be longer than sequence
                (endPoint - startIndex) < patLen)   // start to end cannot be less than pattern
                yield break;

            for (int i = startIndex; i < endPoint; i++)
            {
                addedNew = false;

                // add the first item if a match is found
                if (eqComp(seq.ElementAt(i), pattern.ElementAt(0)))
                {
                    if (patLen == 1)
                        yield return i;

                    found.Add(new MatchInfo(i, 1));
                    addedNew = true;
                }

                // check incomplete matches
                for (int m = found.Count - 1; m >= 0; m--)
                {
                    //skip the last item added
                    if (addedNew && m == found.Count - 1)
                        continue;

                    var match = found[m];

                    // check incomplete matches
                    if ((i - match.StartIndex < patLen) &&
                        eqComp(seq.ElementAt(i), pattern.ElementAt(match.MatchLength)))
                    {
                        match.MatchLength += 1;
                        found[m] = match;

                        // determine if a complete match has been found
                        if (match.MatchLength == patLen)
                        {
                            yield return match.StartIndex;
                            found.RemoveAt(m);
                        }
                    }
                    else
                        found.RemoveAt(m);
                }
            }
        }

    }
}
于 2012-09-26T14:28:47.400 回答