0

我有一个记录类型和一个由该记录类型组成的动态数组。我将它传递给一个合并排序例程并尝试将它的一个字段属性设置为布尔值,但似乎没有生效。

我通过其他方式研究了对记录的排序数组(参见自定义记录数组的快速排序:http ://en.wikibooks.org/wiki/Algorithm_Implementation/Sorting/Quicksort#Delphi )或这里:对数组进行排序的最佳方法(我可以没有得到这些建议,主要是因为创建了一个comaring函数)。这个问题:按字母顺序排列数组?很有帮助并且有效,但是这种排序非常慢。

代码:

type    
       TCustomRecord = Record
        fLine     : AnsiString; //full line
        fsubLine     : AnsiString; // part of full line
        isDuplicate : boolean;  //is that subline duplicate in another line
        isRefrence     : boolean; // is this line from a refrence file or the one being deduped
        fIndex    : Cardinal; // original order line was loaded
       end;
      TCustomRecordArray = array of TCustomRecord; 

function Merge2(var Vals: array of TCustomRecord ):Integer;
var
  AVals: array of TCustomRecord;

   //returns index of the last valid element
  function Merge(I0, I1, J0, J1: Integer):Integer;
  var
    i, j, k, LC:Integer;
  begin
    LC := I1 - I0;
    for i := 0 to LC do
      AVals[i]:=Vals[i + I0];
      //copy lower half or Vals into temporary array AVals

    k := I0;
    i := 0;
    j := J0;
    while ((i <= LC) and (j <= J1)) do
    if (AVals[i].fsubLine < Vals[j].fsubLine) then
    begin
      Vals[k] := AVals[i];
      if Vals[k].isRefrence = False then
        Vals[k].isDuplicate := False;
      inc(i);
      inc(k);
    end
    else if (AVals[i].fsubLine > Vals[j].fsubLine) then
    begin
      Vals[k]:=Vals[j];
      if Vals[k].isRefrence = False then
        Vals[k].isDuplicate := False;
      inc(k);
      inc(j);
    end else
    begin //duplicate
      Vals[k] := AVals[i];
      if Vals[k].isRefrence = False then
        Vals[k].isDuplicate := True;
      inc(i);
      inc(j);
      inc(k);
    end;

    //copy the rest
    while i <= LC do begin
      Vals[k] := AVals[i];
      inc(i);
      inc(k);
    end;

    if k <> j then
      while j <= J1 do begin
        Vals[k]:=Vals[j];
        inc(k);
        inc(j);
      end;

    Result := k - 1;
  end;

 //returns index of the last valid element

  function PerformMergeSort(ALo, AHi:Integer): Integer; //returns
  var
    AMid, I1, J1:Integer;
  begin

  //It would be wise to use Insertion Sort when (AHi - ALo) is small (about 32-100)
    if (ALo < AHi) then
    begin
      AMid:=(ALo + AHi) shr 1;
      I1 := PerformMergeSort(ALo, AMid);
      J1 := PerformMergeSort(AMid + 1, AHi);
      Result := Merge(ALo, I1, AMid + 1, J1);
    end else
      Result := ALo;
  end;

begin
  //SetLength(AVals, Length(Vals) + 1 div 2);
  SetLength(AVals, Length(Vals) div 2 + 1);
  Result := 1 + PerformMergeSort(0, High(Vals));
end;

问题:我怎样才能有效地排序,最好使用mergesort,这个记录数组并根据该排序设置它的一些属性?谢谢你。

更新:我添加了一个指针类型并对指针数组进行了修改的合并排序。结果证明这是对记录数组进行排序的非常快速的方法。我还添加了一个比较例程,它添加了我需要的标志。我无法做的唯一部分是根据它们是否属于文件 A 或参考文件来为重复项添加一个标志。

代码:

    type    
          PCustomRecord = ^TCustomRecord; 
          TCustomRecord = Record
            fLine     : AnsiString; //full line
            fsubLine  : AnsiString; // part of full line
            isDuplicate : boolean;  //is that subline duplicate in another line
            isRefrence     : boolean; // line from a refrence file or the one being deduped
            isUnique  : boolean; //flag to set if not refrence and not dupe
            fIndex    : Cardinal; // original order line was loaded
           end;
          TCustomRecordArray = array of TCustomRecord;
          PCustomRecordList = ^TCustomRecordArray;

//set up actual array
//set up pointer array to point at actual array
//sort by mergesort first
// then call compare function - this can be a procedure obviously

function Compare(var PRecords: array of PCustomRecord; iLength: int64): Integer;
var
  i : Integer;
begin
  for i := 0 to High(PRecords) do
  begin
    Result := AnsiCompareStr(PRecords[i]^.fsubline, PRecords[i+1]^.fsubline);
    if Result=0 then
    begin
      if (PRecords[i].isrefrence = False) then
        PRecords[i].isduplicate := True
      else if (PRecords[i+1].isrefrence = False) then
        PRecords[i+1].isduplicate := True;
    end;
  end;
end; 

procedure MergeSort(var Vals:array of PCustomRecord;ACount:Integer);
var AVals:array of PCustomRecord;

  procedure Merge(ALo,AMid,AHi:Integer);
  var i,j,k,m:Integer;
  begin
    i:=0;
    for j:=ALo to AMid do
    begin
      AVals[i]:=Vals[j];
      inc(i);
      //copy lower half or Vals into temporary array AVals
    end;

    i:=0;j:=AMid + 1;k:=ALo;//j could be undefined after the for loop!
    while ((k < j) and (j <= AHi)) do
    if (AVals[i].fsubline) <= (Vals[j].fsubline) then
    begin
      Vals[k]:=AVals[i];
      inc(i);inc(k);
    end
    else if (AVals[i].fsubline) > (Vals[j].fsubline) then
    begin
      Vals[k]:=Vals[j];
      inc(k);inc(j);
    end;

    {locate next greatest value in Vals or AVals and copy it to the
     right position.}

    for m:=k to j - 1 do
    begin
      Vals[m]:=AVals[i];
      inc(i);
    end;
    //copy back any remaining, unsorted, elements
  end;

  procedure PerformMergeSort(ALo,AHi:Integer);
  var AMid:Integer;
  begin
    if (ALo < AHi) then
    begin
      AMid:=(ALo + AHi) shr 1;
      PerformMergeSort(ALo,AMid);
      PerformMergeSort(AMid + 1,AHi);
      Merge(ALo,AMid,AHi);
    end;
  end;

begin
  SetLength(AVals, ACount div 2 + 1);
  PerformMergeSort(0,ACount - 1);
end;

对于不到一秒钟的小文件,这一切都非常快。不过,对数组中带有重复标志而不是引用标志的项目进行重复数据删除是相当具有挑战性的。由于合并排序是一种稳定的排序,我尝试通过布尔标志进行排序,但没有得到我期望的结果。我用 aTStringlist查看我以前的标志是否设置正确并且工作正常。时间从 1 秒增加到 6 秒。我知道必须有一种简单的方法来isUnique标记没有 TStringlist.

这是我尝试过的:

function DeDupe(var PRecords: array of PCustomRecord; iLength: int64): Integer;
var
  i : Integer;
begin
  for i := 0 to High(PRecords) do
  begin
    if (PRecords[i]^.isrefrence = False) and (PRecords[i+1]^.isrefrence = false)then
    begin
      Result := AnsiCompareStr(PRecords[i]^.isduplicate, PRecords[i+1]^.isduplicate);
      if Result = 0 then PRecords[i]^.isUnique := True;
    end
    else
    begin
      Continue;
    end;
  end;
end;

这并没有得到所有的值,我没有看到它有什么不同,因为我仍然看到很多重复项。我认为逻辑是错误的。

感谢所有帮助的伟大灵魂。请允许我受益,我可能已经知道如何推导 aTObject以及如何使用 a TStringList,所以重点是数组。

问题:帮助我执行上述功能或程序,用 isRefrence = false 和 isDuplicate = True 和唯一标记重复项

编辑 3:我能够通过使用布尔标志来消除重复项。这有助于在不改变数组大小的情况下保持数组稳定。我相信它比使用TList后代或TStringList. 使用诸如数组之类的基本容器在易于编码方面存在局限性,但非常有效,因此我不会传递它。指针使排序变得轻而易举。我不确定当我使用指针数组时如何设置指向我的数组的指针,就像我使用我的常规数组一样。我是否取消了它并没有什么区别。我这样设置指针数组:

  iLength := Length(Custom_array); //get length of actual array
  SetLength(pcustomRecords, iLength); // make pointer array equal + 1

  for M := Low(Custom_array) to High(Custom_array) do //set up pointers
  begin
    pcustomRecords[M] := @Custom_array[M]; 
  end;

我尝试尽可能多地将排序与正在排序的实际数据分开,但我确信可以改进。

///////////////////////////////////////////////////////////////////
function Comparesubstring(Item1, Item2: PCustomRecord): Integer;
begin
  Result := AnsiCompareStr(item1^.fsubline, item2^.fsubline);
end;
///////////////////////////////////////////////////////////////////
function CompareLine(Item1, Item2: PCustomRecord): Integer;
begin
  Result := AnsiCompareStr(item1^.fLine, item2^.fLine);
end;
///////////////////////////////////////////////////////////////////
function Compare(var PRecords: array of PCustomRecord; iLength: int64): Integer;
var
  M, i : Integer;
begin
  M := Length(PRecords);
  for i := 1 to M-1 do
  begin
    Result := Comparesubstring(PRecords[i-1], PRecords[i]);
    if Result=0 then
    begin
      if (PRecords[i-1].isRefrence = False) then
        PRecords[i-1].isduplicate := True
      else if (PRecords[i].isRefrence = False) then
        PRecords[i].isduplicate := True;
    end;
  end;
end;
///////////////////////////////////////////////////////////////////
4

3 回答 3

4

1)不要复制数据!使用指针。您应该创建指向这些数据记录的指针列表/数组,然后对指针进行排序。排序完成后 - 只需根据指针数组创建新的数据数组。指针移动是单 CPU 命令。SizeOf(your record) 是 >> SizeOf(pointer) 并且在移动时要慢得多。

2) Mergesort 在大量数据上摇摆不定,不适合内存。如果您有 10 GB 的数据,则无法在 Win32 程序允许的 2 GB 内存中对它们进行排序。因此,您必须在它们在磁盘上时对其进行排序。这就是 Mergesort 的利基市场。如果所有数据都在内存中,为什么不使用现成的 QuickSort 例程呢?

所以制作一个TList,用指针填充它type PCustomRecord = ^TCustomRecord;,实现适当的比较函数并通过TList.Sort方法调用checked quicksort。

http://docwiki.embarcadero.com/CodeExamples/XE2/en/TListSort_(Delphi)

列表排序后 - 创建并填充新的数据数组。创建新数组后 - 释放列表并删除旧的源数组。


如果可能 - 检查数据是否适合内存。如果内存不足,仅驻留在磁盘上搜索。它会更慢,更慢。


我在学校做过...... Mergesort 不是递归的。这是非常基本的循环。由于它的简单性,我实现了它。我仍然没有对 QuickSort 的直觉,可以与之比较。

在伪代码中它看起来像

FrameSize := 1;
Loop start:
  Phase 1: splitting
     Loop until not empty TempMergedDataFile:
        Read record by record from TempMergedDataFile 
            and write each of them into TempSplitDataFile-1
            up to FrameSize times
        Read record by record from TempMergedDataFile 
            and write each of them into TempSplitDataFile-2
            up to FrameSize times
     Loop end
     Delete TempMergedDataFile 
  Phase 2: sorting-merging
     Loop until not empty TempSplitDataFile-1 and TempSplitDataFile-2:
        Read record by record from both TempSplitDataFile-1 and TempSplitDataFile-2
          up to FrameSize each (2xFrameSize in total in each iteration)
          write them sorted into TempMergedDataFile
     end loop
     delete TempSplitDataFile-1 and TempSplitDataFile-2
  Phase 3: update expectations
     FrameSize := FrameSize * 2
     if FrameSize > actual number of records - then exit loop, sort complete
End loop

小心第 2 阶段的实施。如果文件之一超出帧,则与实际值或 nil 进行比较。嗯,这个想法很明显,可能在某个地方演示过。只是在这部分迂腐。可能 FSM 实现可能很容易。

明显的优化:

  1. 将所有文件放在不同的物理专用 HDD 上,因此每个 HDD 将处于线性读/写模式
  2. 合并阶段 1 和阶段 2:使 TempMergedDataFile 虚拟化,实际上由 TempSplitDataFile-3 和 TempSplitDataFile-4 组成。在写入数据时将数据拆分为下一个大小的帧。
  3. 如果使用 SSD 或闪存卡进行存储,则数据复制会磨损硬件。最好对某种“指针”或“索引”进行排序以进行实际排序。也有很小的机会,虽然完整的数据帧超过了 RAM,但仅仅是“索引数组”就可以了。但是,对于未经测试的实际 HDD,我最好坚持使用幼稚的“复制和复制再复制”方法。
于 2012-10-11T13:01:46.987 回答
3

The first comment to make is that your basic design is very weak. You have mixed the sorting code and the compare/exchange code all in together. If ever you need to sort different data, then you'll have to start again. You need to decouple the sorting code from the code that understands the data.

The way to achieve that decoupling is to implement a generic sort routine that knows nothing about the data. Instead all it needs to know is how to compare two elements, and how to exchange two elements. All the common in-memory sorting routines can be implemented efficiently that way.

The other problem you have, I guess, is that your code will spend a lot of time copying the data around. Instead of doing that, use a layer of indirection. What I mean by that is that you should not attempt to modify the original array. Instead create an array of indices into the data array, and sort the array of indices rather than the array of data.

To give you an idea of that, here's how it might look:

var
  Data: array of TData;
  Indices: array of Integer;

function CompareIndices(Index1, Index2: Integer): Integer;
begin
  Result := CompareData(Data[Indices[Index1]], Data[Indices[Index2]]);
end;

procedure SwapIndices(Index1, Index2: Integer);
var
  Temp: Integer;
begin
  Temp := Indices[Index1];
  Indices[Index1] := Indices[Index2];
  Indices[Index2] := Temp;
end;

Then, in order to sort the array you do something like this:

N := Length(Data);
SetLength(Indices, N);
for i := 0 to high(Indices) do 
  Indices[i] := i;
Sort(CompareIndices, SwapIndices, N);

Or, as yet another alternative, instead of an array of indices, use an array of pointers to the elements of the data array.

Now, I've used global variables here for the sake of clarity of exposition. In reality you'd likely want to wrap this up into a class, or at least make the compare and swap functions be methods of objects. That's how I did it in my Delphi 6 code base. The interface looked like this:

type
  TCompareIndicesFunction = function(Index1, Index2: Integer): Integer of object;
  TExchangeIndicesProcedure = procedure(Index1, Index2: Integer) of object;

procedure QuickSort(Compare: TCompareIndicesFunction; 
  Exchange: TExchangeIndicesProcedure; const Count: Integer);

Once you get on top of the concept of separating the sort algo from the data then you'll make some progress. It then becomes trivial to swap out one sorting algo for another. You can compare them easily. You can readily measure whether or not the indirection approach is worthwhile. And so on.

So, my absolute number one piece of advice for you is to throw away the code in the question and separate sorting from data handling, as nature intended.

于 2012-10-11T13:26:15.297 回答
0

我知道没有直接回答这个问题,但前提是数据可以放入内存中——这似乎就像您使用数组一样。

我会转储所有这些,创建一些对象,将它们放在 TObjectList 中。使用 TObjectList.Sort(@myComparefunction) 使用您自己的比较进行排序。您可以声明多个排序例程。在排序功能期间,您可以随意设置其他对象属性。它非常快,并且可以减轻您似乎正在遭受的很多痛苦:)

于 2012-10-11T17:27:08.570 回答