到目前为止,我对现有的答案都不满意。因此,为了真正回答 OP 提出的“配合 Int.Parse 使用 String.Split 时最快/最高效的做法”这一问题,我编写并测试了一些代码。
测试环境:Intel 3770K,运行时为 Mono。
我发现 String.Split + IEnumerable.Select(LINQ)并不是最快的解决方案(也许是最漂亮的)。事实上,在几种单线程方案中它是最慢的。
以下是一些基准测试结果:
ListSize 1000 : StringLen 10468
SplitForEach1000 Time : 00:00:02.8704048
SplitSelect1000 Time : 00:00:02.9134658
ForEachChar1000 Time : 00:00:01.8254438
SplitParallelSelectr1000 Time : 00:00:07.5421146
ForParallelForEachChar1000 Time : 00:00:05.3534218
ListSize 100000 : StringLen 1048233
SplitForEach100000 Time : 00:00:01.9500846
SplitSelect100000 Time : 00:00:02.2662606
ForEachChar100000 Time : 00:00:01.2554577
SplitParallelSelectr100000 Time : 00:00:02.6509969
ForParallelForEachChar100000 Time : 00:00:01.5842131
ListSize 10000000 : StringLen 104824707
SplitForEach10000000 Time : 00:00:18.2658261
SplitSelect10000000 Time : 00:00:20.6043874
ForEachChar10000000 Time : 00:00:10.0555613
SplitParallelSelectr10000000 Time : 00:00:18.1908017
ForParallelForEachChar10000000 Time : 00:00:08.6756213
以下是得到上述基准测试结果的代码:
using System;
using System.Collections.Generic;
using System.Collections.Concurrent;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using System.Diagnostics;
namespace FastStringSplit
{
class MainClass
{
/// <summary>
/// Benchmarks every parsing strategy in this class against randomly generated,
/// delimiter-separated integer strings of several sizes and prints the
/// accumulated time of each strategy to the console.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main (string[] args)
{
    Random rnd = new Random();
    char delim = ':';
    // sizes[k] integers are joined into one string, which is then parsed iters[k] times per strategy.
    int[] sizes = new int[]{1000, 100000, 10000000 };
    int[] iters = new int[]{10000, 100, 10};
    for(int s=0; s<sizes.Length; s++) {
        List<int> list = new List<int>(sizes[s]);
        for(int i=0; i<sizes[s]; i++)
            list.Add (rnd.Next());
        // Build the input with the same delimiter that is handed to the parsers,
        // so changing delim above cannot silently desynchronize the two.
        string str = string.Join(delim.ToString(), list);
        Console.WriteLine(string.Format("\nListSize {0} : StringLen {1}", sizes[s], str.Length));
        RunBenchmark("SplitForEach", iters[s], str, delim, SplitForEach);
        RunBenchmark("SplitSelect", iters[s], str, delim, SplitSelect);
        RunBenchmark("ForEachChar", iters[s], str, delim, ForEachChar);
        // The label spelling is kept as-is so the output matches previously published results.
        RunBenchmark("SplitParallelSelectr", iters[s], str, delim, SplitParallelSelect);
        RunBenchmark("ForParallelForEachChar", iters[s], str, delim, ForParallelForEachChar);
    }
}

/// <summary>
/// Runs <paramref name="parse"/> <paramref name="iterations"/> times on the same input,
/// timing only the calls themselves, and prints the label, the size of the last
/// result and the accumulated elapsed time.
/// </summary>
/// <param name="label">Text printed in front of the result count.</param>
/// <param name="iterations">How many times the parser is invoked.</param>
/// <param name="input">The delimiter-separated string to parse.</param>
/// <param name="delim">The delimiter passed to the parser.</param>
/// <param name="parse">The parsing strategy under test.</param>
private static void RunBenchmark(string label, int iterations, string input, char delim, Func<string, char, List<int>> parse)
{
    Stopwatch sw = new Stopwatch();
    List<int> result = new List<int>();
    for(int i=0; i<iterations; i++) {
        // Start/Stop around the call accumulates parse time only, not loop overhead.
        sw.Start();
        result = parse(input, delim);
        sw.Stop();
    }
    Console.WriteLine(label + result.Count + " Time : " + sw.Elapsed.ToString());
}
/// <summary>
/// Splits <paramref name="s"/> on <paramref name="delim"/> and parses every piece with int.Parse.
/// </summary>
/// <param name="s">The delimiter-separated integers.</param>
/// <param name="delim">The separator character.</param>
/// <returns>The parsed values, in the order the pieces appear in <paramref name="s"/>.</returns>
public static List<int> SplitForEach(string s, char delim) {
    string[] tokens = s.Split(delim);
    var numbers = new List<int>();
    for (int i = 0; i < tokens.Length; i++) {
        numbers.Add(int.Parse(tokens[i]));
    }
    return numbers;
}
/// <summary>
/// Splits <paramref name="s"/> on <paramref name="delim"/> and projects every piece
/// through int.Parse with a LINQ query.
/// </summary>
/// <param name="s">The delimiter-separated integers.</param>
/// <param name="delim">The separator character.</param>
/// <returns>The parsed values, in the order the pieces appear in <paramref name="s"/>.</returns>
public static List<int> SplitSelect(string s, char delim) {
    string[] tokens = s.Split(delim);
    IEnumerable<int> parsed = from token in tokens
                              select int.Parse(token);
    return parsed.ToList();
}
/// <summary>
/// Parses delimiter-separated integers in a single pass over the string,
/// without first splitting it into an array.
/// </summary>
/// <param name="s">The delimiter-separated integers.</param>
/// <param name="delim">The separator character.</param>
/// <returns>
/// The parsed values in input order; an empty list when <paramref name="s"/> is empty.
/// </returns>
public static List<int> ForEachChar(string s, char delim) {
    var numbers = new List<int>();
    int lastIndex = s.Length - 1;
    int tokenStart = 0;
    // Every delimiter before the final character closes the current token.
    for (int pos = 0; pos < lastIndex; pos++) {
        if (s[pos] != delim) {
            continue;
        }
        numbers.Add(int.Parse(s.Substring(tokenStart, pos - tokenStart)));
        tokenStart = pos + 1;
    }
    // The final character always belongs to the last token, which runs to the end of the string.
    if (lastIndex >= 0) {
        numbers.Add(int.Parse(s.Substring(tokenStart)));
    }
    return numbers;
}
/// <summary>
/// Splits <paramref name="s"/> on <paramref name="delim"/> and parses the pieces in
/// parallel with PLINQ.
/// </summary>
/// <param name="s">The delimiter-separated integers.</param>
/// <param name="delim">The separator character.</param>
/// <returns>The parsed values, in the order the pieces appear in <paramref name="s"/>.</returns>
public static List<int> SplitParallelSelect(string s, char delim) {
    return s.Split(delim)
        .AsParallel()
        // Without AsOrdered a parallel query may yield results in any order,
        // which would make the list differ from the sequential parsers.
        .AsOrdered()
        .Select(int.Parse)
        .ToList();
}
/// <summary>
/// Number of chunks/tasks used by <see cref="ForParallelForEachChar"/>:
/// the processor count, but never fewer than 2.
/// </summary>
public static int NumOfThreads = Environment.ProcessorCount > 2 ? Environment.ProcessorCount : 2;

/// <summary>
/// Parses delimiter-separated integers by cutting the string into
/// <see cref="NumOfThreads"/> chunks that end on a delimiter (or the end of the
/// string), parsing each chunk in its own task and concatenating the per-chunk
/// results in chunk order.
/// </summary>
/// <param name="s">The delimiter-separated integers.</param>
/// <param name="delim">The separator character.</param>
/// <returns>
/// The parsed values in input order; an empty list when <paramref name="s"/> is empty.
/// </returns>
public static List<int> ForParallelForEachChar(string s, char delim) {
    // Read the field once so the chunk count stays consistent for the whole call.
    int workers = NumOfThreads;
    int chunkSize = (s.Length / workers) + 1;
    // bounds[k] .. bounds[k + 1] is the half-open character range of chunk k.
    int[] bounds = new int[workers + 1];
    Task<List<int>>[] tasks = new Task<List<int>>[workers];
    for(int x = 0; x < workers; x++) {
        // Advance the tentative boundary to the next delimiter so no number is cut in half.
        int next = bounds[x] + chunkSize;
        while(next < s.Length && s[next] != delim)
            next++;
        bounds[x + 1] = Math.Min(next, s.Length);
        // Copy into per-iteration locals so each task captures its own range.
        int chunkStart = bounds[x];
        int chunkEnd = bounds[x + 1];
        tasks[x] = Task.Factory.StartNew(() => ParseChunk(s, delim, chunkStart, chunkEnd));
    }
    Task.WaitAll(tasks);
    // Each task fills its own list; appending them in chunk order keeps the input
    // order, which a shared unordered bag would not.
    List<int> result = new List<int>();
    foreach(Task<List<int>> task in tasks)
        result.AddRange(task.Result);
    return result;
}

/// <summary>
/// Parses the integers contained in the character range [start, end) of <paramref name="s"/>.
/// </summary>
private static List<int> ParseChunk(string s, char delim, int start, int end) {
    List<int> numbers = new List<int>();
    // Trailing chunks can be empty when the string is shorter than the chunk layout.
    if(start >= s.Length)
        return numbers;
    // A chunk boundary is placed on a delimiter; skip it so it is not part of the first token.
    if(s[start] == delim)
        start++;
    for(int i = start; i < end; i++) {
        if(s[i] == delim || i == end - 1) {
            // The last character of the chunk belongs to the token, so include it.
            if(i == end - 1)
                i++;
            numbers.Add(int.Parse(s.Substring(start, i - start)));
            start = i + 1;
        }
    }
    return numbers;
}
}
}
这是我推荐的函数:
/// <summary>
/// Parses delimiter-separated integers in a single pass over the string,
/// without first splitting it into an array.
/// </summary>
/// <param name="s">The delimiter-separated integers.</param>
/// <param name="delim">The separator character.</param>
/// <returns>
/// The parsed values in input order; an empty list when <paramref name="s"/> is empty.
/// </returns>
public static List<int> ForEachChar(string s, char delim) {
    var numbers = new List<int>();
    int lastIndex = s.Length - 1;
    int tokenStart = 0;
    // Every delimiter before the final character closes the current token.
    for (int pos = 0; pos < lastIndex; pos++) {
        if (s[pos] != delim) {
            continue;
        }
        numbers.Add(int.Parse(s.Substring(tokenStart, pos - tokenStart)));
        tokenStart = pos + 1;
    }
    // The final character always belongs to the last token, which runs to the end of the string.
    if (lastIndex >= 0) {
        numbers.Add(int.Parse(s.Substring(tokenStart)));
    }
    return numbers;
}
为什么它更快?
它不会先把字符串拆分成数组,而是在一次遍历中同时完成拆分和解析,因此省去了“先遍历字符串进行拆分、再遍历数组进行解析”的额外开销。
我还加入了一个使用 Task(任务)的并行化版本,但只有在字符串非常大的情况下它才会更快。