以下是我处理大文件(50 GB)的方式:
我尝试了 2 种不同的方法:第一种,将文件读入内存并使用 Regex Replace 或 String Replace。然后我将整个字符串附加到一个临时文件中。
第一种方法在只需要做少量正则表达式替换时可行,但如果要在大文件中进行多次替换,Regex.Replace 或 String.Replace 可能会导致内存不足错误。
第二种是逐行读取临时文件并使用 StringBuilder 手动构建每一行并将每个处理的行附加到结果文件中。这种方法非常快。
/// <summary>
/// Demonstrates two ways of applying text replacements to a large file:
/// first entirely in memory (EX 1), then streaming line by line (EX 2).
/// Both examples read <c>inFileName</c> and write <c>outFileName</c>; because
/// EX 2 deletes and recreates the output file, only its result remains once
/// the method returns.
/// </summary>
static void ProcessLargeFile()
{
    ReplaceInMemory();
    ReplaceLineByLine();
}

/// <summary>
/// EX 1: loads the entire input file into one string, applies
/// <c>String.Replace</c> and <c>Regex.Replace</c>, and writes the result out.
/// Might cause an out-of-memory error on very large files, since the whole
/// content (plus each intermediate copy) must fit in memory.
/// </summary>
private static void ReplaceInMemory()
{
    if (File.Exists(outFileName))
    {
        File.Delete(outFileName);
    }

    string text = File.ReadAllText(inFileName, Encoding.UTF8);

    // Each replacement allocates a new full-size string.
    text = text.Replace("</text>", "");

    // Non-greedy match of <ref ...>...</ref>. No RegexOptions are passed, so
    // '.' does not match '\n' and a <ref> element that spans several lines is
    // left untouched.
    text = Regex.Replace(text, @"\<ref.*?\</ref\>", "");

    File.WriteAllText(outFileName, text);
}

/// <summary>
/// EX 2: streams the input file one line at a time, transforms each line with
/// <c>Util.ReplaceDoubleBrackets</c>, and writes it to the output file, so only
/// a single line is held in memory at any moment.
/// </summary>
private static void ReplaceLineByLine()
{
    if (File.Exists(outFileName))
    {
        File.Delete(outFileName);
    }

    // The writer is opened before the reader, so the output file is created
    // even if opening the input subsequently fails.
    using (var sw = new StreamWriter(outFileName))
    using (var fs = File.OpenRead(inFileName))
    using (var sr = new StreamReader(fs, Encoding.UTF8)) // use UTF8 or whatever encoding your file uses
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            // Note: call your own replace function or use String.Replace here.
            string newLine = Util.ReplaceDoubleBrackets(line);
            sw.WriteLine(newLine);
        }
    }
}
/// <summary>
/// Skeleton for replacing a span delimited by "[[" and "]]" within one line.
/// </summary>
/// <param name="str">The line to examine.</param>
/// <returns>
/// <paramref name="str"/> itself when it contains no "[["; otherwise the text
/// accumulated in the StringBuilder (in the code shown, only <c>newstr</c>).
/// </returns>
public static string ReplaceDoubleBrackets(string str)
{
// Intended to replace the first occurrence of a word delimited by [[ ]];
// substitute your own delimiter for "[[" as needed.
// Fast path: nothing to replace, so return the input before any builder is created.
if (str.IndexOf("[[") < 0)
return str;
StringBuilder sb = new StringBuilder();
// Locate the span to replace. Put this in a loop if a line can hold more than one occurrence.
int posStart = str.IndexOf("[[");
// NOTE(review): this search starts at index 0 rather than at posStart, so it yields -1
// when "]]" is absent and may find a "]]" that precedes "[["; neither case is checked here.
int posEnd = str.IndexOf("]]");
// Distance from the start of "[[" to the start of "]]" (opening delimiter included,
// closing one excluded). Not used in the code shown; presumably consumed by the elided step.
int length = posEnd - posStart;
// ... code to replace with newstr
// NOTE(review): newstr is not declared in this block; presumably the elided step defines it.
// The visible code appends only newstr, not the text before or after the span - confirm
// that the elided code takes care of that.
sb.Append(newstr);
return sb.ToString();
}