2

我需要做的是比较两个字符串并用开始/结束标记标记差异以进行更改。例子:

这是第一个字符串。
这个字符串是第二个字符串。

输出将是

this [is|string is] 字符串编号 [one|two]。  

我一直在试图弄清楚这一点。我发现一些我认为可以帮助我做到这一点的东西,但我无法做到这一点。
http://www.angusj.com/delphi/textdiff.html

我有大约 80% 在这里工作,但我不知道如何让它完全按照我的意愿工作。任何帮助,将不胜感激。


uses
  Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
  Dialogs, Diff, StdCtrls;

type
  TForm2 = class(TForm)
    Edit1: TEdit;
    Edit2: TEdit;
    Button1: TButton;
    Memo1: TMemo;
    Diff: TDiff;
    procedure Button1Click(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;

var
  Form2: TForm2;
  s1,s2:string;

implementation

{$R *.dfm}

procedure TForm2.Button1Click(Sender: TObject);
var
  i: Integer;
  lastKind: TChangeKind;

  procedure AddCharToStr(var s: string; c: char; kind, lastkind: TChangeKind);
  begin
    if (kind  lastkind) AND (lastkind = ckNone) and (kind  ckNone) then s:=s+'[';
    if (kind  lastkind) AND (lastkind  ckNone) and (kind = ckNone) then s:=s+']';
    case kind of         
      ckNone: s := s  + c;
      ckAdd: s := s + c;         
      ckDelete: s := s  + c;
      ckModify: s := s + '|' + c;
    end;
  end;

begin
  Diff.Execute(pchar(edit1.text), pchar(edit2.text), length(edit1.text), length(edit2.text));

  //now, display the diffs ...
  lastKind := ckNone;
  s1 := ''; s2 := '';
  form2.caption:= inttostr(diff.Count);
  for i := 0 to Diff.count-1 do
  begin

    with Diff.Compares[i] do
    begin
      //show changes to first string (with spaces for adds to align with second string)
      if Kind = ckAdd then 
      begin 
        AddCharToStr(s1,' ',Kind, lastKind); 
      end
      else 
      AddCharToStr(s1,chr1,Kind,lastKind);
      if Kind = ckDelete then 
      begin 
        AddCharToStr(s2,' ',Kind, lastKind)
      end
      else AddCharToStr(s2,chr2,Kind,lastKind);

      lastKind := Kind;
    end;
  end;
    memo1.Lines.Add(s1);
    memo1.Lines.Add(s2);
end;

end.

我从 angusj.com 获取了 basicdemo1 并对其进行了修改以实现这一目标。

4

2 回答 2

4

要解决您描述的问题,您基本上必须做一些类似于在DNA 或蛋白质数据的生物序列比对中所做的事情。如果您只有两个字符串(或一个常量引用字符串),则可以通过基于动态规划的成对对齐算法(例如Needleman Wunsch 算法* 和相关算法)来处理。(多序列比对变得更加复杂。)

[* 编辑:链接应该是:http ://en.wikipedia.org/wiki/Needleman –Wunsch_algorithm]

编辑2:

由于您似乎对在单词而不是字符级别进行比较感兴趣,因此您可以 (1) 将输入字符串拆分为字符串数组,其中每个数组元素代表一个单词,然后 (2) 在级别执行对齐这些话。这样做的好处是对齐的搜索空间变得更小,因此您希望它总体上更快。我已经相应地改编并“补充”了维基百科文章中的伪代码示例:


program AlignWords;

{$APPTYPE CONSOLE}

function MaxChoice (C1, C2, C3: integer): integer; begin Result:= C1; if C2 > Result then Result:= C2; if C3 > Result then Result:= C3; end;

function WordSim (const S1, S2: String): integer; overload; //Case-sensitive! var i, l1, l2, minL: integer; begin l1:= length(S1); l2:= length(S2); Result:= l1-l2; if Result > 0 then Result:= -Result; if (S1='') or (S2='') then exit; minL:= l1; if l2 < l1 then minL:= l2; for i := 1 to minL do if S1[i]<>S2[i] then dec(Result); end;

procedure AlignWordsNW (const A, B: Array of String; GapChar: Char; const Delimiter: ShortString; GapPenalty: integer; out AlignmentA, AlignmentB: string); // Needleman-Wunsch alignment // GapPenalty should be a negative value! var F: array of array of integer; i, j, Choice1, Choice2, Choice3, Score, ScoreDiag, ScoreUp, ScoreLeft :integer; function GapChars (const S: String): String; var i: integer; begin assert (length(S)>0); Result:=''; for i := 0 to length(S) - 1 do Result:=Result + GapChar; end; begin SetLength (F, length(A)+1, length(B)+1); for i := 0 to length(A) do F[i,0]:= GapPenaltyi; for j := 0 to length(B) do F[0,j]:= GapPenaltyj; for i:=1 to length(A) do begin for j:= 1 to length(B) do begin Choice1:= F[i-1,j-1] + WordSim(A[i-1], B[j-1]); Choice2:= F[i-1, j] + GapPenalty; Choice3:= F[i, j-1] + GapPenalty; F[i,j]:= maxChoice (Choice1, Choice2, Choice3); end; end; AlignmentA:= ''; AlignmentB:= ''; i:= length(A); j:= length(B); while (i > 0) and (j > 0) do begin Score := F[i,j]; ScoreDiag:= F[i-1,j-1]; ScoreUp:= F[i,j-1]; ScoreLeft:= F[i-1,j]; if Score = ScoreDiag + WordSim(A[i-1], B[j-1]) then begin AlignmentA:= A[i-1] + Delimiter + AlignmentA; AlignmentB:= B[j-1] + Delimiter + AlignmentB; dec (i); dec (j); end else if Score = ScoreLeft + GapPenalty then begin AlignmentA:= A[i-1] + Delimiter + AlignmentA; AlignmentB:= GapChars (A[i-1]) + Delimiter + AlignmentB; dec(i); end else begin assert (Score = ScoreUp + GapPenalty); AlignmentA:= GapChars(B[j-1]) + Delimiter + AlignmentA; AlignmentB:= B[j-1] + Delimiter + AlignmentB; dec (j); end; end; while (i > 0) do begin AlignmentA:= A[i-1] + Delimiter + AlignmentA; AlignmentB:= GapChars(A[i-1]) + Delimiter + AlignmentB; dec(i); end; while (j > 0) do begin AlignmentA:= GapChars(B[j-1]) + Delimiter + AlignmentA; AlignmentB:= B[j-1] + Delimiter + AlignmentB; dec(j); end; end;

Type TStringArray = Array Of String;

Var as1, as2: TStringArray; s1, s2: string;

BEGIN as1:= TStringArray.create ('this','is','string','number','one.'); as2:= TStringArray.Create ('this','string','is','string','number','two.');

AlignWordsNW (as1, as2, '-',' ',-1, s1,s2);
writeln (s1);
writeln (s2);

END.

这个例子的输出是

这 ------ 是字符串编号---- 一个。
这个字符串是第二个字符串。----

它并不完美,但你明白了。从这种输出中,你应该能够做你想做的事。请注意,您可能需要调整GapPenalty和 相似度评分函数WordSim以满足您的需求。

于 2010-01-26T18:31:26.453 回答
1

有一个可用的Object Pascal Diff Engine可能会有所帮助。您可能希望将每个“单词”分成单独的行进行比较,或者修改算法以逐字比较。

于 2010-01-26T21:14:21.447 回答