2

我正在从我的硬盘驱动器上的文本文件中读取 IMDB 电影列表(最初可从ftp://ftp.fu-berlin.de/pub/misc/movies/database/movies.list.gz的 IMDB 站点获得)。

在我的机器(基本信息:Win7 x64bit,16GB RAM,500 GB SATA Hardisk 7200 RPM)上使用下面的代码逐行读取此文件大约需要 5 分钟。

我有两个问题:

  1. 有什么方法可以优化代码以提高读取时间?

  2. 数据访问不需要是顺序的,因为我不介意从上到下/从下到上或任何顺序读取数据,只要它一次读取一行。我想知道有没有办法从多个方向阅读以提高阅读时间?

该应用程序是 Windows 控制台应用程序。

更新:许多回复正确地指出写入控制台需要大量时间。考虑到在 Windows 控制台上显示数据现在是可取的但不是强制性的。

//代码块

// Path of the movie list file to read.
string file = @"D:\movies.list";

// Open the file read-only with no sharing. The fifth constructor argument (8) is the
// FileStream buffer size.
// NOTE(review): such a small buffer may contribute to the slow read time — confirm by measuring.
FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.None, 8, FileOptions.None);

using (StreamReader sr = new StreamReader(fs))
{
  // Keep going for as long as Peek() reports a non-negative value.
  while (sr.Peek() >= 0)
  {
    // Read one line and echo it to the console.
    Console.WriteLine(sr.ReadLine());
  }
}
4

5 回答 5

0

首先,如果您不关心将列表打印到控制台,请编辑您的问题。

其次,我创建了一个计时程序来测试建议的不同方法的速度:

/// <summary>
/// Timing harness that runs several strategies for reading a text file and
/// printing it to the console, then reports the average time per strategy.
/// </summary>
class Program
{
    // File that every test reads; relative to the working directory.
    private static readonly string file = @"movies.list";

    // Index of the first test to run. Tests with a lower index are skipped
    // (0 runs everything, 1 skips "Naive").
    private static readonly int testStart = 1;

    // Minimum total measured time in milliseconds; below this the iteration
    // count is doubled and the measurement is repeated.
    private static readonly int MinTimingVal = 1000;

    // Display names, indexed exactly like the action array built in Main.
    private static string[] testNames = new string[] {
        "Naive",
        "OneCallToWrite",
        "SomeCallsToWrite",
        "InParallel",
        "InParallelBlocks",
        "IceManMinds",
        "TestTiming"
        };

    // Total number of registered tests. Derived from testNames so that every
    // per-test array is large enough for all indices assigned in Main.
    // Static initializers run in textual order, so this must stay below testNames.
    private static readonly int numOfTests = testNames.Length;

    // Average seconds per run, per test index.
    private static double[] avgSecs = new double[numOfTests];

    // Number of timed iterations used for each test index.
    private static int[] testIterations = new int[numOfTests];

    /// <summary>
    /// Shrinks the console window, runs every test from <c>testStart</c> onwards,
    /// prints the results and waits for Enter.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    public static void Main(string[] args)
    {
        Console.WriteLine("Starting tests...");
        Debug.WriteLine("Starting tests...");

        Console.WriteLine("");
        Debug.WriteLine("");

        //*****************************
        //The console is the bottle-neck, so we can
        //speed-up redrawing it by only showing 1 line at a time.
        Console.WindowHeight = 1;
        Console.WindowWidth = 50;

        Console.BufferHeight = 100;
        Console.BufferWidth = 50;
        //******************************

        Action[] actionArray = new Action[numOfTests];

        actionArray[0] = naive;
        actionArray[1] = oneCallToWrite;
        actionArray[2] = someCallsToWrite;
        actionArray[3] = inParallel;
        actionArray[4] = inParallelBlocks;
        actionArray[5] = iceManMinds;
        actionArray[6] = testTiming;

        for (int i = testStart; i < actionArray.Length; i++)
        {
            DoTiming(actionArray[i], i);
        }

        printResults();

        Console.WriteLine("");
        Debug.WriteLine("");

        Console.WriteLine("Tests complete.");
        Debug.WriteLine("Tests complete.");

        Console.WriteLine("Press Enter to Close Console...");
        Debug.WriteLine("Press Enter to Close Console...");

        Console.ReadLine();
    }

    /// <summary>
    /// Runs <paramref name="a"/> once as a warm-up, then times batches of runs,
    /// doubling the batch size until a batch takes at least <c>MinTimingVal</c>
    /// milliseconds. Stores the per-run average in <c>avgSecs[num]</c>.
    /// </summary>
    /// <param name="a">The test to time.</param>
    /// <param name="num">Index of the test in the result arrays.</param>
    private static void DoTiming(Action a, int num)
    {
        // Warm-up run; not included in the measurement.
        a.Invoke();

        Stopwatch watch = new Stopwatch();

        bool shouldRetry;

        int numOfIterations = 2;

        do
        {
            watch.Start();

            for (int i = 0; i < numOfIterations; i++)
            {
                a.Invoke();
            }

            watch.Stop();

            shouldRetry = false;

            if (watch.ElapsedMilliseconds < MinTimingVal) //if the time was less than the minimum, increase load and re-time.
            {
                shouldRetry = true;
                numOfIterations *= 2;
                watch.Reset();
            }

        } while (shouldRetry);

        long totalTime = watch.ElapsedMilliseconds;

        double avgTime = ((double)totalTime) / (double)numOfIterations;

        avgSecs[num] = avgTime / 1000.00;
        testIterations[num] = numOfIterations;
    }

    /// <summary>
    /// Writes one result line per executed test to both the console and the debug output.
    /// </summary>
    private static void printResults()
    {
        Console.WriteLine("");
        Debug.WriteLine("");

        for (int i = testStart; i < numOfTests; i++)
        {
            TimeSpan t = TimeSpan.FromSeconds(avgSecs[i]);

            // Format once and pass a finished string: Debug.WriteLine(string, string)
            // would treat the second argument as a category, not a format argument.
            string line = string.Format("ElapsedTime: {0}, test: {1}", t.ToString(), testNames[i]);

            Console.WriteLine(line);
            Debug.WriteLine(line);
        }
    }

    /// <summary>
    /// Baseline: the code from the question, one Console.WriteLine per line read.
    /// </summary>
    public static void naive()
    {
        // Buffer size 8 is kept on purpose so this matches the code being measured.
        FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.None, 8, FileOptions.None);

        using (StreamReader sr = new StreamReader(fs))
        {
            while (sr.Peek() >= 0)
            {
                Console.WriteLine(sr.ReadLine());
            }
        }
    }

    /// <summary>
    /// Collects the whole file in a StringBuilder and writes it with a single Console.Write call.
    /// </summary>
    public static void oneCallToWrite()
    {
        FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.None, 8, FileOptions.None);

        using (StreamReader sr = new StreamReader(fs))
        {
            StringBuilder sb = new StringBuilder();

            while (sr.Peek() >= 0)
            {
                string s = sr.ReadLine();

                // Each line is preceded (not followed) by a newline.
                sb.Append('\n').Append(s);
            }

            Console.Write(sb);
        }
    }

    /// <summary>
    /// Like <see cref="oneCallToWrite"/>, but flushes the buffer to the console every 10000 lines.
    /// </summary>
    public static void someCallsToWrite()
    {
        FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.None, 8, FileOptions.None);

        using (StreamReader sr = new StreamReader(fs))
        {
            StringBuilder sb = new StringBuilder();
            int count = 0;
            int mod = 10000;

            while (sr.Peek() >= 0)
            {
                count++;

                string s = sr.ReadLine();

                sb.Append('\n').Append(s);

                if (count % mod == 0)
                {
                    Console.Write(sb);
                    sb = new StringBuilder();
                }
            }

            // Write whatever is left after the last full batch.
            Console.Write(sb);
        }
    }

    /// <summary>
    /// Reads all lines into memory, then writes each line from a Parallel.For body.
    /// </summary>
    public static void inParallel()
    {
        string[] wordsFromFile = File.ReadAllLines(file);

        int length = wordsFromFile.Length;

        Parallel.For(0, length, i =>
        {
            Console.WriteLine(wordsFromFile[i]);
        });
    }

    /// <summary>
    /// Reads all lines into memory, then lets Parallel.For accumulate lines into a
    /// per-task StringBuilder and writes each accumulated block once when the task finishes.
    /// </summary>
    public static void inParallelBlocks()
    {
        string[] wordsFromFile = File.ReadAllLines(file);

        int length = wordsFromFile.Length;

        Parallel.For<StringBuilder>(0, length,
            () => { return new StringBuilder(); },
            (i, loopState, sb) =>
            {
                sb.Append('\n').Append(wordsFromFile[i]);
                return sb;
            },
            (x) => { Console.Write(x); }
        );
    }

    #region iceManMinds

    /// <summary>
    /// Reads the file as raw bytes in fixed-size blocks using several threads per round,
    /// assembles the blocks into one buffer, decodes it as ASCII and writes it to the console.
    /// </summary>
    public static void iceManMinds()
    {
        string FileName = file;
        long ThreadReadBlockSize = 50000;
        int NumberOfThreads = 4;
        byte[] _inputString;

        var fi = new FileInfo(FileName);
        long totalBytesRead = 0;
        long fileLength = fi.Length;
        long readPosition = 0L;
        Console.WriteLine("Reading Lines From {0}", FileName);
        var threads = new Thread[NumberOfThreads];
        var instances = new ReadThread[NumberOfThreads];
        _inputString = new byte[fileLength];

        while (totalBytesRead < fileLength)
        {
            // Start one reader per thread, each on the next consecutive block.
            for (int i = 0; i < NumberOfThreads; i++)
            {
                var rt = new ReadThread { StartPosition = readPosition, BlockSize = ThreadReadBlockSize };
                instances[i] = rt;
                threads[i] = new Thread(rt.Read);
                threads[i].Start();
                readPosition += ThreadReadBlockSize;
            }
            for (int i = 0; i < NumberOfThreads; i++)
            {
                threads[i].Join();
            }
            // Copy every non-empty block to its position in the combined buffer.
            for (int i = 0; i < NumberOfThreads; i++)
            {
                if (instances[i].BlockSize > 0)
                {
                    Array.Copy(instances[i].Output, 0L, _inputString, instances[i].StartPosition,
                               instances[i].BlockSize);
                    totalBytesRead += instances[i].BlockSize;
                }
            }
        }

        string finalString = Encoding.ASCII.GetString(_inputString);
        Console.WriteLine(finalString);
    }

    /// <summary>
    /// Reads one block of the file; intended to be run on its own thread.
    /// </summary>
    private class ReadThread
    {
        /// <summary>Byte offset in the file at which this block starts.</summary>
        public long StartPosition { get; set; }

        /// <summary>Requested block size before <see cref="Read"/>; bytes actually read afterwards.</summary>
        public long BlockSize { get; set; }

        /// <summary>Buffer holding the bytes read by <see cref="Read"/>.</summary>
        public byte[] Output { get; private set; }

        /// <summary>
        /// Opens the file, seeks to <see cref="StartPosition"/> and reads until the block
        /// is full or the end of the file is reached.
        /// </summary>
        public void Read()
        {
            Output = new byte[BlockSize];
            int filled = 0;

            using (var inStream = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            {
                inStream.Seek(StartPosition, SeekOrigin.Begin);

                // Stream.Read may return fewer bytes than requested, so keep reading.
                // A short block would otherwise leave a gap in the combined buffer and
                // prevent the caller's byte count from ever reaching the file length.
                int read;
                while (filled < Output.Length &&
                       (read = inStream.Read(Output, filled, Output.Length - filled)) > 0)
                {
                    filled += read;
                }
            }

            BlockSize = filled;
        }
    }

    #endregion

    /// <summary>
    /// Control test with a fixed 500 ms delay, used to sanity-check the timing code.
    /// </summary>
    public static void testTiming()
    {
        Thread.Sleep(500);
    }
}

这些测试中的每一个都将文件打印到控制台。

在默认控制台设置下运行时,每个测试都在 5:30 到 6:10(Min:Sec)之间进行。

考虑了Console属性后,通过使Console.WindowHeight = 1,即一次只显示1行,(可以上下滚动查看最近的100行),实现了提速。

目前,对于大多数方法,该任务只需 2:40 多一点(Min:Sec)即可完成。

在您的计算机上尝试一下,看看它如何为您工作。

有趣的是,不同的方法基本上是等效的,OP 的代码基本上是最快的。

计时代码预热代码然后运行它两次并平均它所花费的时间,它为每个方法执行此操作。

随意尝试自己的方法并计时。

于 2012-07-11T23:15:08.230 回答
0

这个问题的答案实际上取决于您将如何处理数据。如果您的意图确实只是读入文件并将内容转储到控制台屏幕,那么最好使用StringBuilder 类构建一个字符串,例如 1000 行,然后将内容转储到屏幕,重置然后将字符串读入另外 1000 行,转储它们,等等……

但是,如果您正在尝试构建属于较大项目的一部分并且使用 .NET 4.0,则可以使用MemoryMappedFile 类来读取文件并创建CreateViewAccessor来创建仅在一部分上运行的“窗口”数据而不是读取整个文件。

另一种选择是让线程一次读取文件的不同部分,然后最后将它们放在一起。

如果您可以更具体地说明您打算如何处理这些数据,我可以为您提供更多帮助。希望这可以帮助!

编辑:

试试这个代码。使用 Threads,我能够在 3 秒内阅读整个列表:

using System;
using System.IO;
using System.Text;
using System.Threading;

namespace ConsoleApplication36
{
    /// <summary>
    /// Reads a file as raw bytes in fixed-size blocks using several threads per round,
    /// assembles the blocks into one buffer, decodes it as ASCII and prints a sample of it.
    /// </summary>
    class Program
    {
        private const string FileName = @"C:\Users\Public\movies.list";
        private const long ThreadReadBlockSize = 50000;
        private const int NumberOfThreads = 4;

        // Start offset and length (in characters) of the sample printed at the end.
        private const int PreviewOffset = 104250000;
        private const int PreviewLength = 50000;

        private static byte[] _inputString;

        /// <summary>
        /// Reads the whole file with <c>NumberOfThreads</c> threads per round and prints
        /// up to <c>PreviewLength</c> characters starting at <c>PreviewOffset</c>.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var fi = new FileInfo(FileName);
            long totalBytesRead = 0;
            long fileLength = fi.Length;
            long readPosition = 0L;
            Console.WriteLine("Reading Lines From {0}", FileName);
            var threads = new Thread[NumberOfThreads];
            var instances = new ReadThread[NumberOfThreads];
            _inputString = new byte[fileLength];

            while (totalBytesRead < fileLength)
            {
                // Start one reader per thread, each on the next consecutive block.
                for (int i = 0; i < NumberOfThreads; i++)
                {
                    var rt = new ReadThread { StartPosition = readPosition, BlockSize = ThreadReadBlockSize };
                    instances[i] = rt;
                    threads[i] = new Thread(rt.Read);
                    threads[i].Start();
                    readPosition += ThreadReadBlockSize;
                }
                for (int i = 0; i < NumberOfThreads; i++)
                {
                    threads[i].Join();
                }
                // Copy every non-empty block to its position in the combined buffer.
                for (int i = 0; i < NumberOfThreads; i++)
                {
                    if (instances[i].BlockSize > 0)
                    {
                        Array.Copy(instances[i].Output, 0L, _inputString, instances[i].StartPosition,
                                   instances[i].BlockSize);
                        totalBytesRead += instances[i].BlockSize;
                    }
                }
            }

            string finalString = Encoding.ASCII.GetString(_inputString);

            // Clamp the sample window so a file shorter than the hard-coded offset
            // prints what is available instead of throwing ArgumentOutOfRangeException.
            int previewStart = Math.Min(PreviewOffset, finalString.Length);
            int previewLength = Math.Min(PreviewLength, finalString.Length - previewStart);
            Console.WriteLine(finalString.Substring(previewStart, previewLength));
        }

        /// <summary>
        /// Reads one block of the file; intended to be run on its own thread.
        /// </summary>
        private class ReadThread
        {
            /// <summary>Byte offset in the file at which this block starts.</summary>
            public long StartPosition { get; set; }

            /// <summary>Requested block size before <see cref="Read"/>; bytes actually read afterwards.</summary>
            public long BlockSize { get; set; }

            /// <summary>Buffer holding the bytes read by <see cref="Read"/>.</summary>
            public byte[] Output { get; private set; }

            /// <summary>
            /// Opens the file, seeks to <see cref="StartPosition"/> and reads until the block
            /// is full or the end of the file is reached.
            /// </summary>
            public void Read()
            {
                Output = new byte[BlockSize];
                int filled = 0;

                using (var inStream = new FileStream(FileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                {
                    inStream.Seek(StartPosition, SeekOrigin.Begin);

                    // Stream.Read may return fewer bytes than requested, so keep reading.
                    // A short block would otherwise leave a gap in the combined buffer and
                    // prevent the caller's byte count from ever reaching the file length.
                    int read;
                    while (filled < Output.Length &&
                           (read = inStream.Read(Output, filled, Output.Length - filled)) > 0)
                    {
                        filled += read;
                    }
                }

                BlockSize = filled;
            }
        }
    }
}

您需要更改 FileName 以匹配您的 movies.list 文件的位置。此外,您可以调整线程总数。我用了4,但你可以随意减少或增加。您还可以更改块大小...这是每个线程读取的数据量。另外,我假设它是一个 ASCII 文本文件。如果不是,您需要将编码类型更改为 UTF8 或文件​​所在的任何编码。祝您好运!

于 2012-07-07T05:46:58.460 回答
0

我不确定这是否更有效,但另一种方法是使用File.ReadAllLines

// Load every line of the file into memory, then echo the lines in order.
foreach (string line in File.ReadAllLines(file))
{
    Console.WriteLine(line);
}
于 2012-07-07T04:28:49.137 回答
0

在 .net 4 中,您可以使用 File.ReadLines 进行延迟评估,从而在处理大文件时降低 RAM 使用率。

您可以直接对文件执行 linq 操作,这与 File.ReadLines 一起可以缩短加载时间。

为了更好地理解你可以检查,Read text file word-by-word using LINQ

您也可以进行比较,但要设置时间间隔。

但是,如果您制作 Web 应用程序,您可以在应用程序启动事件中读取整个文件并将它们缓存在应用程序池中以获得更好的性能。

于 2012-07-07T04:38:42.337 回答
0

我不是 ac# 开发人员,但是如何使用文件(这将是一次)批量插入数据库。然后,您也可以重用数据并导出。

于 2012-07-07T05:05:25.500 回答