1

我有一个函数,基本上可以分为两个子函数。

html = RetrieveHTML(int index);
returnCollection = RegexProcess(html, index);

通过将 RetrieveHTML 并行化来加速此过程的最佳方法是什么?

通常我会用多达 20000 个索引来调用它。第一个子函数依赖于网络(使用 WebClient.DownloadString 从同一台服务器获取多个 URL 的 HTML),第二个子函数主要消耗 CPU。

我迷失在 Parallel.ForEach 和 Task(ContinueWith、ContinueWhenAll、FromAsync)的世界中,很难找到解决方案。我首先尝试了 Parallel.ForEach,因为它比较简单,但我发现它的性能(即网络 I/O)会随着连续调用而下降(第一轮循环很快,之后的循环变慢)。解决方案应当在处理完 html 对象后就将其释放,因为它们数量多且体积大。我正在使用 .NET 4.0 ...

4

1 回答 1

0

你好,你可以试试下面的代码

// Pattern applied to every downloaded page. Assigned once here and only read
// afterwards (by the matching tasks in ProcessInParallell), hence readonly.
private readonly Regex _regex = new Regex("net");

        /// <summary>
        /// Downloads a fixed set of pages in parallel and matches each page against
        /// <c>_regex</c> on its own task as soon as that page has been downloaded.
        /// </summary>
        /// <remarks>
        /// The download stage runs as a producer that feeds a <see cref="BlockingCollection{T}"/>;
        /// the calling thread consumes from it and starts one matching task per page, so
        /// downloading and matching overlap.
        /// NOTE(review): if all URLs point at the same host, throughput may also be capped by
        /// ServicePointManager.DefaultConnectionLimit — confirm and raise it at start-up if needed.
        /// </remarks>
        /// <exception cref="AggregateException">
        /// Thrown when a download or a matching task fails; the inner exceptions carry the causes.
        /// </exception>
        private void ProcessInParallell()
        {
            Uri[] resourceUri = new Uri[]
            {
                new Uri("http://www.microsoft.com"),
                new Uri("http://www.google.com"),
                new Uri("http://www.amazon.com")
            };

            // Holds the pages that matched; several matching tasks add to it concurrently.
            ConcurrentBag<string> matchedHtml = new ConcurrentBag<string>();

            List<Task> processingTasks = new List<Task>();

            using (BlockingCollection<string> htmlDataList = new BlockingCollection<string>())
            {
                // Stage 1: download the HTML on a background task so that the consumer loop
                // below can start matching while downloads are still in flight.
                Task downloadTask = Task.Factory.StartNew(() =>
                {
                    try
                    {
                        Parallel.For(0, resourceUri.Length, index =>
                        {
                            htmlDataList.Add(RetrieveHTML(resourceUri[index]));
                        });
                    }
                    finally
                    {
                        // Signal completion only after every iteration has finished (or failed).
                        // Parallel.For gives no ordering guarantee, so signalling from the
                        // iteration with the last index could close the collection while other
                        // iterations are still adding. Doing it in finally also guarantees the
                        // consumer loop ends when a download throws.
                        htmlDataList.CompleteAdding();
                    }
                }, TaskCreationOptions.LongRunning);

                // Stage 2: start a matching task for each page as it becomes available.
                foreach (var html in htmlDataList.GetConsumingEnumerable())
                {
                    // Copy to a per-iteration local so the lambda does not capture the loop variable.
                    string downloadedHtml = html;
                    if (downloadedHtml == null)
                    {
                        continue;
                    }

                    processingTasks.Add(Task.Factory.StartNew(() =>
                    {
                        if (_regex.IsMatch(downloadedHtml))
                        {
                            matchedHtml.Add(downloadedHtml);
                        }
                    }));
                }

                // Wait for the download stage too, so that its exceptions are observed
                // together with those of the matching tasks.
                processingTasks.Add(downloadTask);
                Task.WaitAll(processingTasks.ToArray());
            }

            foreach (var html in matchedHtml)
            {
                //Do something with the matched result

            }
        }

        /// <summary>
        /// Downloads the resource at <paramref name="uri"/> and returns its bytes decoded as UTF-8 text.
        /// </summary>
        /// <param name="uri">Address of the resource to download.</param>
        /// <returns>The downloaded content decoded with <see cref="Encoding.UTF8"/>.</returns>
        private string  RetrieveHTML(Uri uri)
        {
            // Proxy is explicitly set to null: no proxy is configured for these requests.
            using (var client = new WebClient { Proxy = null })
            {
                byte[] payload = client.DownloadData(uri);
                return Encoding.UTF8.GetString(payload);
            }
        }
于 2013-01-23T00:29:13.443 回答