更新:(原帖在下面)Colin 提出了一个绝妙的主意:把正则表达式实例移到调用之外,这样它们只会被创建一次。下面是新的程序:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml.Linq;
using System.Diagnostics;
using System.Text.RegularExpressions;
namespace ConsoleApplication3
{
/// <summary>
/// Signature shared by the candidate implementations handed to the benchmark:
/// takes the raw input string and returns a string result.
/// </summary>
delegate String xmltestFunc(String data);
class Program
{
// Number of timed invocations performed for each benchmark case.
static readonly int iterations = 1000000;

/// <summary>
/// Verifies that <paramref name="func"/> produces <paramref name="expectedResult"/>
/// for <paramref name="data"/>, then times <c>iterations</c> further calls and prints
/// the elapsed time in seconds. Prints "fail" and skips the timing when the result
/// is wrong.
/// </summary>
/// <param name="func">Candidate implementation to measure.</param>
/// <param name="data">Input passed to the candidate on every call.</param>
/// <param name="expectedResult">Value the candidate must return to be timed.</param>
private static void benchmark(xmltestFunc func, String data, String expectedResult)
{
    bool correct = func(data).Equals(expectedResult);
    if (correct)
    {
        Stopwatch timer = new Stopwatch();
        timer.Start();
        int remaining = iterations;
        while (remaining-- > 0)
        {
            func(data);
        }
        timer.Stop();

        // Milliseconds -> seconds, kept as a float exactly like the printed figures.
        float seconds = (float)((float)timer.ElapsedMilliseconds / 1000);
        Console.WriteLine(data + ": " + seconds);
    }
    else
    {
        // A candidate that returns the wrong answer is reported and never timed.
        Console.WriteLine(data + ": fail");
    }
}
/// <summary>
/// Entry point. Benchmarks each candidate on three inputs built around a sample
/// text: the text wrapped in a tag, the same markup padded with one space on each
/// side, and the bare text. Waits for Enter before exiting.
/// </summary>
static void Main(string[] args)
{
    runCases(xmltest1, "base");
    runCases(xmltest2, "ColinBurnett");
    runCases(xmltest3, "Si");
    runCases(xmltest4, "RashmiPandit");
    runCases(xmltest5, "Custom");

    // "press any key to continue"
    Console.WriteLine("Done.");
    Console.ReadLine();
}

// Runs the three benchmark inputs for one candidate; text is also the expected result.
private static void runCases(xmltestFunc func, String text)
{
    String wrapped = "<tag>" + text + "</tag>";
    benchmark(func, wrapped, text);
    benchmark(func, " " + wrapped + " ", text);
    benchmark(func, text, text);
}
/// <summary>
/// Baseline candidate: always attempts an XML parse and returns the element value,
/// falling back to the unchanged input when the parser raises an XmlException.
/// </summary>
/// <param name="data">Text that may or may not be an XML element.</param>
/// <returns>The element's value, or <paramref name="data"/> when it is not XML.</returns>
public static String xmltest1(String data)
{
    String text = data;
    try
    {
        text = XElement.Parse(data).Value;
    }
    catch (System.Xml.XmlException)
    {
        // Not parseable as XML: the input itself is the result.
    }
    return text;
}
// Matches input whose first character after any run of spaces, tabs, CRs or LFs is '<'.
// Created once and reused by every call.
static Regex xmltest2regex = new Regex("^[ \t\r\n]*<");

/// <summary>
/// Candidate that only attempts an XML parse when the input looks like markup,
/// i.e. its first character is '<' or it matches <c>xmltest2regex</c>.
/// </summary>
/// <param name="data">Text that may or may not be an XML element.</param>
/// <returns>The element's value, or <paramref name="data"/> when it is not XML.</returns>
public static String xmltest2(String data)
{
    // Has to have length to be XML
    if (string.IsNullOrEmpty(data))
    {
        return data;
    }

    // Cheap first-character check first; the regex covers leading whitespace.
    bool looksLikeXml = data[0] == '<' || xmltest2regex.IsMatch(data);
    if (!looksLikeXml)
    {
        return data;
    }

    try
    {
        return XElement.Parse(data).Value;
    }
    catch (System.Xml.XmlException)
    {
        return data;
    }
}
// <name>text</name> where the closing tag must repeat the opening tag's name.
// Created once and reused by every call.
static Regex xmltest3regex = new Regex(@"<(?<tag>\w*)>(?<text>.*)</\k<tag>>");

/// <summary>
/// Regex-only candidate: extracts the text between a matching pair of tags
/// without invoking an XML parser.
/// </summary>
/// <param name="data">Text that may or may not contain a tagged element.</param>
/// <returns>The captured "text" group, or <paramref name="data"/> when nothing matches.</returns>
public static String xmltest3(String data)
{
    Match found = xmltest3regex.Match(data);
    if (!found.Success)
    {
        return data;
    }

    // A successful match always exposes its groups, so the capture can be read directly.
    return found.Groups["text"].Value;
}
/// <summary>
/// Candidate that delegates to <c>XmlExpresssion.TryParse</c> and falls back to the
/// unchanged input when that call reports failure.
/// </summary>
/// <param name="data">Text that may or may not be an XML element.</param>
/// <returns>The parsed value, or <paramref name="data"/> when parsing fails.</returns>
public static String xmltest4(String data)
{
    String parsed;
    return XmlExpresssion.TryParse(data, out parsed) ? parsed : data;
}
/// <summary>
/// Candidate that only attempts an XML parse when the first non-whitespace
/// character of the input is '<'.
/// </summary>
/// <remarks>
/// The previous version evaluated <c>data.Trim()[0]</c>, which throws
/// IndexOutOfRangeException for whitespace-only input, and then ran a regex that
/// could never succeed once the Trim check had failed. Both are replaced by a
/// single scan that allocates nothing.
/// </remarks>
/// <param name="data">Text that may or may not be an XML element.</param>
/// <returns>The element's value, or <paramref name="data"/> when it is not XML.</returns>
public static String xmltest5(String data)
{
    // Has to have length to be XML
    if (string.IsNullOrEmpty(data))
    {
        return data;
    }

    // Skip leading whitespace to find the character that decides whether to parse.
    int first = 0;
    while (first < data.Length && char.IsWhiteSpace(data[first]))
    {
        ++first;
    }

    // Whitespace-only input, or text that does not open with '<', is not XML.
    if (first == data.Length || data[first] != '<')
    {
        return data;
    }

    try
    {
        return XElement.Parse(data).Value;
    }
    catch (System.Xml.XmlException)
    {
        return data;
    }
}
}
/// <summary>
/// Regex-based recogniser for a single XML element such as <c>&lt;text&gt;data&lt;/text&gt;</c>
/// or <c>&lt;text /&gt;</c>, exposing a TryParse-style accessor for the element's data.
/// </summary>
public class XmlExpresssion
{
    // EXPLANATION OF EXPRESSION
    // <        : \<{1}
    // text     : (?<xmlTag>\w+)  : xmlTag is a backreference so that the start and end tags match
    // >        : >{1}
    // xml data : (?<data>.*)     : data is a backreference used for the regex to return the element data
    // </       : <{1}/{1}
    // text     : \k<xmlTag>
    // >        : >{1}
    // (\w|\W)* : Matches attributes if any
    //
    // Sample match and pattern egs
    // Just to show how the patterns were built up incrementally so that the final pattern is well-understood
    // <text>data</text>
    //   @"^\<{1}(?<xmlTag>\w+)\>{1}.*\<{1}/{1}\k<xmlTag>\>{1}$";
    // <text />
    //   @"^\<{1}(?<xmlTag>\w+)\s*/{1}\>{1}$";
    // <text>data</text> or <text />
    //   @"^\<{1}(?<xmlTag>\w+)((\>{1}.*\<{1}/{1}\k<xmlTag>)|(\s*/{1}))\>{1}$";
    // <text>data</text> or <text /> or <text attr='2'>xml data</text> or <text attr='2' attr2 >data</text>
    //   @"^\<{1}(?<xmlTag>\w+)(((\w|\W)*\>{1}(?<data>.*)\<{1}/{1}\k<xmlTag>)|(\s*/{1}))\>{1}$";
    private const string XML_PATTERN = @"^\<{1}(?<xmlTag>\w+)(((\w|\W)*\>{1}(?<data>.*)\<{1}/{1}\k<xmlTag>)|(\s*/{1}))\>{1}$";

    // Compiled once and shared by every call; never reassigned.
    private static readonly Regex regex = new Regex(XML_PATTERN, RegexOptions.Compiled);

    /// <summary>
    /// Assigns the element value to result if the string is xml
    /// </summary>
    /// <param name="s">Candidate text; null is treated as "not xml".</param>
    /// <param name="result">
    /// The captured element data on success (empty for the self-closing form),
    /// otherwise null.
    /// </param>
    /// <returns>true if success, false otherwise</returns>
    public static bool TryParse(string s, out string result)
    {
        if (s != null)
        {
            // One Match answers both "is it xml?" and "what is the data?"; the
            // pattern is anchored with ^...$, so padded input is still rejected.
            Match match = regex.Match(s);
            if (match.Success)
            {
                result = match.Groups["data"].Value;
                return true;
            }
        }

        result = null;
        return false;
    }
}
}
以下是新结果:
<tag>base</tag>: 3.667
<tag>base</tag> : 3.707
base: 40.737
<tag>ColinBurnett</tag>: 3.707
<tag>ColinBurnett</tag> : 4.784
ColinBurnett: 0.413
<tag>Si</tag>: 2.016
<tag>Si</tag> : 2.141
Si: 0.087
<tag>RashmiPandit</tag>: 12.305
<tag>RashmiPandit</tag> : fail
RashmiPandit: 0.131
<tag>Custom</tag>: 3.761
<tag>Custom</tag> : 3.866
Custom: 0.329
Done.
结果一目了然。预编译(只创建一次)的正则表达式才是正确的做法,而且效率也相当高。
(原帖)
我拼凑了以下程序,对针对这个问题给出的各个代码示例进行基准测试,以说明我这篇帖子的推理,并评估各个答案的速度。
事不宜迟,程序如下。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml.Linq;
using System.Diagnostics;
using System.Text.RegularExpressions;
namespace ConsoleApplication3
{
/// <summary>
/// Signature shared by the candidate implementations handed to the benchmark:
/// takes the raw input string and returns a string result.
/// </summary>
delegate String xmltestFunc(String data);
class Program
{
// Number of timed invocations performed for each benchmark case.
static readonly int iterations = 1000000;

/// <summary>
/// Verifies that <paramref name="func"/> produces <paramref name="expectedResult"/>
/// for <paramref name="data"/>, then times <c>iterations</c> further calls and prints
/// the elapsed time in seconds. Prints "fail" and skips the timing when the result
/// is wrong.
/// </summary>
/// <param name="func">Candidate implementation to measure.</param>
/// <param name="data">Input passed to the candidate on every call.</param>
/// <param name="expectedResult">Value the candidate must return to be timed.</param>
private static void benchmark(xmltestFunc func, String data, String expectedResult)
{
    bool correct = func(data).Equals(expectedResult);
    if (correct)
    {
        Stopwatch timer = new Stopwatch();
        timer.Start();
        int remaining = iterations;
        while (remaining-- > 0)
        {
            func(data);
        }
        timer.Stop();

        // Milliseconds -> seconds, kept as a float exactly like the printed figures.
        float seconds = (float)((float)timer.ElapsedMilliseconds / 1000);
        Console.WriteLine(data + ": " + seconds);
    }
    else
    {
        // A candidate that returns the wrong answer is reported and never timed.
        Console.WriteLine(data + ": fail");
    }
}
/// <summary>
/// Entry point. Benchmarks each candidate on three inputs built around a sample
/// text: the text wrapped in a tag, the same markup padded with one space on each
/// side, and the bare text. Waits for Enter before exiting.
/// </summary>
static void Main(string[] args)
{
    runCases(xmltest1, "base");
    runCases(xmltest2, "ColinBurnett");
    runCases(xmltest3, "Si");
    runCases(xmltest4, "RashmiPandit");

    // "press any key to continue"
    Console.WriteLine("Done.");
    Console.ReadLine();
}

// Runs the three benchmark inputs for one candidate; text is also the expected result.
private static void runCases(xmltestFunc func, String text)
{
    String wrapped = "<tag>" + text + "</tag>";
    benchmark(func, wrapped, text);
    benchmark(func, " " + wrapped + " ", text);
    benchmark(func, text, text);
}
/// <summary>
/// Baseline candidate: always attempts an XML parse and returns the element value,
/// falling back to the unchanged input when the parser raises an XmlException.
/// </summary>
/// <param name="data">Text that may or may not be an XML element.</param>
/// <returns>The element's value, or <paramref name="data"/> when it is not XML.</returns>
public static String xmltest1(String data)
{
    String text = data;
    try
    {
        text = XElement.Parse(data).Value;
    }
    catch (System.Xml.XmlException)
    {
        // Not parseable as XML: the input itself is the result.
    }
    return text;
}
/// <summary>
/// Candidate that only attempts an XML parse when the input looks like markup:
/// either its first character is '<', or a leading-whitespace regex finds a '<'
/// after spaces, tabs, CRs or LFs.
/// </summary>
/// <param name="data">Text that may or may not be an XML element.</param>
/// <returns>The element's value, or <paramref name="data"/> when it is not XML.</returns>
public static String xmltest2(String data)
{
    // Has to have length to be XML
    if (string.IsNullOrEmpty(data))
    {
        return data;
    }

    // If it starts with a < then it probably is XML
    if (data[0] != '<')
    {
        // But also cover the case where there is indeterminate whitespace before the <
        // Note: this version builds a new Regex on every call that gets here.
        Regex leadingBlanks = new Regex("^[ \t\r\n]*<");
        if (!leadingBlanks.IsMatch(data))
        {
            return data;
        }
    }

    try
    {
        return XElement.Parse(data).Value;
    }
    catch (System.Xml.XmlException)
    {
        return data;
    }
}
/// <summary>
/// Regex-only candidate: extracts the text between a matching pair of tags
/// without invoking an XML parser. This version builds its Regex on every call.
/// </summary>
/// <param name="data">Text that may or may not contain a tagged element.</param>
/// <returns>The captured "text" group, or <paramref name="data"/> when nothing matches.</returns>
public static String xmltest3(String data)
{
    // <name>text</name> where the closing tag must repeat the opening tag's name.
    Regex tagPair = new Regex(@"<(?<tag>\w*)>(?<text>.*)</\k<tag>>");
    Match found = tagPair.Match(data);
    if (!found.Success)
    {
        return data;
    }

    // A successful match always exposes its groups, so the capture can be read directly.
    return found.Groups["text"].Value;
}
/// <summary>
/// Candidate that delegates to <c>XmlExpresssion.TryParse</c> and falls back to the
/// unchanged input when that call reports failure.
/// </summary>
/// <param name="data">Text that may or may not be an XML element.</param>
/// <returns>The parsed value, or <paramref name="data"/> when parsing fails.</returns>
public static String xmltest4(String data)
{
    String parsed;
    return XmlExpresssion.TryParse(data, out parsed) ? parsed : data;
}
}
/// <summary>
/// Regex-based recogniser for a single XML element such as <c>&lt;text&gt;data&lt;/text&gt;</c>
/// or <c>&lt;text /&gt;</c>, exposing a TryParse-style accessor for the element's data.
/// </summary>
public class XmlExpresssion
{
    // HOW THE PATTERN IS PUT TOGETHER
    //   \<{1}            the opening '<'
    //   (?<xmlTag>\w+)   the tag name, captured so the closing tag can be required to match
    //   (\w|\W)*         any attributes
    //   \>{1}            the '>' that ends the opening tag
    //   (?<data>.*)      the element data, captured so it can be returned
    //   \<{1}/{1}        the "</" of the closing tag
    //   \k<xmlTag>       the same tag name again
    //   \>{1}            the final '>'
    //
    // The pattern was grown step by step:
    //   <text>data</text>
    //     @"^\<{1}(?<xmlTag>\w+)\>{1}.*\<{1}/{1}\k<xmlTag>\>{1}$";
    //   <text />
    //     @"^\<{1}(?<xmlTag>\w+)\s*/{1}\>{1}$";
    //   <text>data</text> or <text />
    //     @"^\<{1}(?<xmlTag>\w+)((\>{1}.*\<{1}/{1}\k<xmlTag>)|(\s*/{1}))\>{1}$";
    //   <text>data</text> or <text /> or <text attr='2'>xml data</text> or <text attr='2' attr2 >data</text>
    //     @"^\<{1}(?<xmlTag>\w+)(((\w|\W)*\>{1}(?<data>.*)\<{1}/{1}\k<xmlTag>)|(\s*/{1}))\>{1}$";
    private const string XML_PATTERN = @"^\<{1}(?<xmlTag>\w+)(((\w|\W)*\>{1}(?<data>.*)\<{1}/{1}\k<xmlTag>)|(\s*/{1}))\>{1}$";

    // True when the whole string matches XML_PATTERN.
    private static bool IsXml(string value)
    {
        bool matches = Regex.IsMatch(value, XML_PATTERN);
        return matches;
    }

    /// <summary>
    /// Hands back the element data through <paramref name="result"/> when
    /// <paramref name="s"/> matches the pattern.
    /// </summary>
    /// <param name="s">Candidate text.</param>
    /// <param name="result">The captured data on success, otherwise null.</param>
    /// <returns>true when the text matched, false otherwise.</returns>
    public static bool TryParse(string s, out string result)
    {
        if (!IsXml(s))
        {
            result = null;
            return false;
        }

        // This version constructs a compiled Regex on every successful call.
        Match match = new Regex(XML_PATTERN, RegexOptions.Compiled).Match(s);
        result = match.Result("${data}");
        return true;
    }
}
}
这是结果。每一个都被执行了 100 万次。
<tag>base</tag>: 3.531
<tag>base</tag> : 3.624
base: 41.422
<tag>ColinBurnett</tag>: 3.622
<tag>ColinBurnett</tag> : 16.467
ColinBurnett: 7.995
<tag>Si</tag>: 19.014
<tag>Si</tag> : 19.201
Si: 15.567
测试 4 耗时过长:运行 30 分钟后仍未完成,因此被认定为太慢。为了说明它有多慢,下面是同一组测试各只运行 1000 次的结果。
<tag>base</tag>: 0.004
<tag>base</tag> : 0.004
base: 0.047
<tag>ColinBurnett</tag>: 0.003
<tag>ColinBurnett</tag> : 0.016
ColinBurnett: 0.008
<tag>Si</tag>: 0.021
<tag>Si</tag> : 0.017
Si: 0.014
<tag>RashmiPandit</tag>: 3.456
<tag>RashmiPandit</tag> : fail
RashmiPandit: 0
Done.
按此推算,执行一百万次将需要 3456 秒,即略多于 57 分钟。
这是一个很好的例子,说明如果您追求高效的代码,为什么复杂的正则表达式不是一个好主意。不过,它也表明在某些情况下,简单的正则表达式仍然是不错的答案——例如 ColinBurnett 的答案中对 XML 所做的小型“预检测”:它可能让基本情况的开销更高(因为在情况 2 中需要创建正则表达式),但由于避免了异常,非 XML 输入这一分支的耗时要短得多。