c# - 使用 C# 的数组算法的字典排序数组

Question

我知道我在处理 vb6 之前已经问过这种问题，而且它太慢了，所以我决定使用 C# 来完成这项工作；现在相同的代码以两倍的速度运行，但仍然太慢了。

它慢的原因是它从每列的末尾开始按字典顺序排序，检查所有行。

我相信会加快这一进程的是，如果我从第一列开始排序过程，检查所有行并按该列的第一个字节检测最低行，并可能检测具有相同第一个低字节的多行并将它们分组以进行下一步它检查第二个（下一个）列，如果它们都相同，则检查第二个字节中的哪个是最低字节，等等。如果它检测到下一行字节不同的地方，那么列代码就完成了第一个字节并继续寻找第二低的字节..这实际上是我认为这个过程应该如何工作以获得良好的速度提升..但不幸的是我对这种排序技术有很大的困惑，最终使用了有人帮助我的东西.

当前代码通过蛮力排序从最后一列开始工作，它对所有行进行排序。然后将一列向左移动并重新对每一行重新排序，直到它到达第一列并对其进行排序。这很慢，因为它没有明显的原因进行迭代。

假设有 256 列和 256 行，总共 65,536 个数组元素.. 使用当前代码并说它必须对每一行进行多次排序，直到每一行得到正确的排序顺序。对于每一列，它可能需要 65,536 次迭代。因此，每次我调用该函数时，总共估计有 256*65536= 16,777,216次迭代，这就是它运行缓慢的实际原因。

我知道这有很多要求，但如果有人有空闲时间并且可能之前已经这样做过可以帮助我，我将不胜感激。

这是到目前为止我必须使用的代码。

byte[] sortArrayOfArraysLexicoGraphically(ref byte[] data) {
    byte[] lexicoGraphicalIndexes;
    long dataSize = data.Length;
    long squareRootMinusOne;
    int squareRoot;
    int row = 0;
    bool rowSwapped;
    byte[] tmpRow;

    squareRoot = (int)Math.Sqrt(dataSize);
    tmpRow = new byte[squareRoot];
    squareRootMinusOne = squareRoot - 1;
    lexicoGraphicalIndexes = new byte[squareRoot];

    for(short column = 0; column < lexicoGraphicalIndexes.Length; column++) {
        lexicoGraphicalIndexes[column] = (byte)column;
    }

    for(long column = squareRootMinusOne; column >= 0; column -= 1) {
        do {
            rowSwapped = false;
            do {
                if(data[(row * squareRoot) + column] > data[((row + 1) * squareRoot) + column]) {
                    //Swaps a full row in a few copies.
                    //Copies full row to tmpRow
                    Buffer.BlockCopy(data, (row * squareRoot), tmpRow, 0, squareRoot);
                    //Replace first row with second row.
                    Buffer.BlockCopy(data, ((row + 1) * squareRoot), data, (row * squareRoot), squareRoot);
                    //Replace second row with tmpRow
                    Buffer.BlockCopy(tmpRow, 0, data, ((row + 1) * squareRoot), squareRoot);
                    swapBytes(ref lexicoGraphicalIndexes, row, row + 1);
                    rowSwapped = true;
                }
                row++;
            } while (row < squareRootMinusOne);
            row = 0;
        } while (rowSwapped != false);
    }
    return lexicoGraphicalIndexes;
}

public void swapBytes(ref byte[] data, long firstIndex, long secondIndex) {
    byte tmpFirstByte = data[firstIndex];
    data[firstIndex] = data[secondIndex];
    data[secondIndex] = tmpFirstByte;
}

score 2 · Accepted Answer

我必须说你的排序算法非常糟糕。即使没有任何优化并使用基本的 linq，您也可以获得数十倍的速度。

我用大小为 N*N 的数据进行了测试，其中 N=200（我不确定以下代码是否与您的完全匹配并且 100% 正确，但至少您可以尝试查看结果）

List<byte[]> result = data.Batch(N)
                          .OrderBy(b => b, new ArrayComparer())
                          .Select(b => b.ToArray())
                          .ToList();

编辑

就地排序甚至可以更快。

var list = data.Batch(N).Select(x => x.ToArray()).ToList();
list.Sort(new ArrayComparer());

-

public class ArrayComparer : IComparer<IEnumerable<byte>>
{
    public int Compare(IEnumerable<byte> x, IEnumerable<byte> y)
    {
        var xenum = x.GetEnumerator();
        var yenum = y.GetEnumerator();
        while (xenum.MoveNext() && yenum.MoveNext())
        {
            if (xenum.Current != yenum.Current) 
                   return xenum.Current - yenum.Current;
        }
        return 0;
    }
}

PS：Batch是morelinq的扩展方法

score 0 · Accepted Answer

结束了写这个长怪物，但它似乎做了一些测试运行的伎俩..不确定它是否完美需要更多测试，当我做更多测试时我会更新这个。

    int[] sortArrayOfArraysLexicoGraphically(ref byte[] data) {
        int[] lexicoGraphicalIndexes;
        long dataSize = data.Length;
        int squareRoot;
        bool rowSwapped;

        squareRoot = (int)Math.Sqrt(dataSize);
        lexicoGraphicalIndexes = new int[squareRoot];

        for(int column = 0; column < lexicoGraphicalIndexes.Length; column++) {
            lexicoGraphicalIndexes[column] = column;
        }

        byte currentLowestRowByte = 255; //set to highest to avoid unassigned local variable error.
        int previousLowestRowByte = -1; //this is only used after the second pass.
        int lowestRowIndex = -1; //hopefully this won't mess anything up.
        List<int> lowestRowIndexes = new List<int>();
        bool stillSorting = true;
        int startRow = 0; //which row to start with, as the sorting process gets more sorted this number increases.
        int startColumn = 0; //first column

        while(stillSorting) {
            //Resets
            lowestRowIndexes.Clear();
            startColumn = 0;
            currentLowestRowByte = 255;
            lowestRowIndex = -1;

            //first step finds the lowest row in the first column
            for(int row = 0; row < squareRoot; row += 1) {
                if(data[(row * squareRoot) + startColumn] <= currentLowestRowByte && 
                    data[(row * squareRoot) + startColumn] > previousLowestRowByte) {
                    currentLowestRowByte = data[(row * squareRoot) + startColumn];
                    lowestRowIndex = row;
                }
            }

            //Resets for next pass.
            previousLowestRowByte = currentLowestRowByte;

            //Check if sorting process is already finished. (No matches found from step 1).
            if(lowestRowIndex == -1) {
                stillSorting = false;
                break;
            }

            //second step finds all the similar rows with the current lowestRowByte.
            for(int row = 0; row < squareRoot; row += 1) {
                if(data[(row * squareRoot) + startColumn] == currentLowestRowByte) {
                    lowestRowIndexes.Add(row);
                }
            }

            //third step loops through all lowestRowIndexes to find which one comes first, second, third, etc...
            if(lowestRowIndexes.Count > 1) {
                //This sorts the same rows, rows*rows amount of times, until they are sorted correctly.
                rowSwapped = true;
                while(rowSwapped != false) {
                    rowSwapped = false;
                    for (int row = 0; row < lowestRowIndexes.Count; row++)
                    {
                        if((row+1) >= lowestRowIndexes.Count)
                            break;
                        //Current first row byte checked with Next first row byte in lowestRowIndexes.
                        //If both are equal keep going unto next column until a break is found, if any break.
                        startColumn = 1;
                        while(rowSwapped == false) {
                            //Reached beyond the last column.
                            if(startColumn == squareRoot)
                                break;

                            if(data[(lowestRowIndexes[row] * squareRoot) + startColumn] == data[(lowestRowIndexes[row+1] * squareRoot) + startColumn])
                                startColumn++;

                            if(data[(lowestRowIndexes[row] * squareRoot) + startColumn] < data[(lowestRowIndexes[row+1] * squareRoot) + startColumn]) {
                                break; //Sorted already, get out.
                            } else if(data[(lowestRowIndexes[row] * squareRoot) + startColumn] > data[(lowestRowIndexes[row+1] * squareRoot) + startColumn]) {
                                swapBytesRow(ref data, lowestRowIndexes[row], lowestRowIndexes[row+1], squareRoot);
                                swapBytes(ref lexicoGraphicalIndexes, lowestRowIndexes[row], lowestRowIndexes[row+1]);
                                rowSwapped = true; //a swap has occurred.
                                break;
                            }
                        }
                    }
                }

                //forth step re-sorts all the pre-sorted lowestRowIndexes into master array, using startRow variable.
                foreach(int row in lowestRowIndexes) {

                    //First checks if row is already in the proper sorted location.
                    if(row != startRow) {
                        swapBytesRow(ref data, startRow, row, squareRoot);
                        swapBytes(ref lexicoGraphicalIndexes, startRow, row);
                        startRow++; //skip Rows starting from value < startRow as they are perfectly sorted.
                    } else {
                        startRow++; //skip Rows starting from value < startRow as they are perfectly sorted.
                    }                     
                }
            } else {
                //Only one instance of this lowestRowByte existed. so obviously this is the next best sorted match.
                swapBytesRow(ref data, startRow, lowestRowIndex, squareRoot);
                swapBytes(ref lexicoGraphicalIndexes, startRow, lowestRowIndex);
                startRow++; //skip Rows starting from value < startRow as they are perfectly sorted.
            }
        }
        return lexicoGraphicalIndexes;
    }

.

    public void swapBytes(ref byte[] data, long firstIndex, long secondIndex) {
        byte tmpFirstByte = data[firstIndex];
        data[firstIndex] = data[secondIndex];
        data[secondIndex] = tmpFirstByte;
    }

.

    public void swapBytes(ref int[] data, long firstIndex, long secondIndex) {
        int tmpFirstByte = data[firstIndex];
        data[firstIndex] = data[secondIndex];
        data[secondIndex] = tmpFirstByte;
    }

.

    public void swapBytesRow(ref byte[] data, int firstRowIndex, int secondRowIndex, int rowSize) {
        byte[] tmpFirstRowBytes = new byte[rowSize];
        //Copies full row to tmpFirstRowBytes
        Buffer.BlockCopy(data, (firstRowIndex * rowSize), tmpFirstRowBytes, 0, rowSize);
        //Replace first row with second row.
        Buffer.BlockCopy(data, (secondRowIndex * rowSize), data, (firstRowIndex * rowSize), rowSize);
        //Replace second row with tmpFirstRowBytes
        Buffer.BlockCopy(tmpFirstRowBytes, 0, data, (secondRowIndex * rowSize), rowSize);
    }

c# - 使用 C# 的数组算法的字典排序数组

2 回答 2

Related

Reference