I've tried to be as thorough as possible in this post, because this really matters to me, even though the question itself is simple and you can probably grasp it from the title alone...
The question is:
Given a healthy bandwidth (30 Mb VDSL)...
How do I get multiple HttpWebRequest instances working on a single piece of data / a single file, so that each request downloads only one part of the data, and then, when all instances have finished, the parts are merged back into one?
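To make the goal concrete, this is roughly the shape I have in mind (pure theory on my part and not tested: it assumes the server honors the HTTP Range header and reports Content-Length, and the method name DownloadInParts is just mine):

// Theoretical sketch, not tested: download one file in N parts via HTTP
// Range requests, then stitch the parts back together in order.
// needs: using System.IO; using System.Net; using System.Threading.Tasks;
public static void DownloadInParts(string url, string destinationPath, int partCount)
{
    // 1. ask only for the headers, to learn the total size
    var headRequest = (HttpWebRequest)WebRequest.Create(url);
    headRequest.Method = "HEAD";
    long totalLength;
    using (var headResponse = (HttpWebResponse)headRequest.GetResponse())
    {
        totalLength = headResponse.ContentLength;
    }

    long partSize = totalLength / partCount;
    var parts = new byte[partCount][];

    // 2. fetch each byte range on its own task
    Parallel.For(0, partCount, i =>
    {
        long from = i * partSize;
        long to = (i == partCount - 1) ? totalLength - 1 : from + partSize - 1;

        var request = (HttpWebRequest)WebRequest.Create(url);
        request.AddRange(from, to); // only bytes from..to of the file
        using (var response = (HttpWebResponse)request.GetResponse())
        using (var stream = response.GetResponseStream())
        using (var buffer = new MemoryStream())
        {
            // a 206 Partial Content status here means the server really
            // honored the range; a 200 would mean it sent the whole file
            stream.CopyTo(buffer);
            parts[i] = buffer.ToArray();
        }
    });

    // 3. reassemble the parts in order
    using (var output = new FileStream(destinationPath, FileMode.Create))
    {
        foreach (var part in parts)
        {
            output.Write(part, 0, part.Length);
        }
    }
}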
Code:
...what I've done so far follows the same idea, except that each task = one HttpWebRequest = a different file,
so the speedup there is pure task parallelism, rather than speeding up one single download using multiple tasks/threads,
which is what this question is about.
See the code below.
The next part is just a more detailed explanation of and background to the subject... if you don't mind reading it.
I'm still working on a similar project that differs from this one (the one in question):
it (see the code below) fetches as many different data sources as possible, one per separate task (a different download/file). The speedup there comes from each task not having to wait for the previous one to finish before it gets its chance to execute.
What I want to do in this question (everything is already prepared in the code below) is target the same url for the same data, so this time the speedup gained is for the current single-task download.
In other words: implement the same idea as in the code below, only this time using multiple SmartWebClient instances all targeting the same url.
Then (for now it's only theory) each instance would request a partial range of the data, with one or more requests per instance.
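I imagine this could be done with a hypothetical subclass like the following (the same GetWebRequest trick that SmartWebClient below already uses, just adding a Range header; the class name RangedWebClient is mine):

// Hypothetical variant of SmartWebClient: each instance is pinned to one
// byte range, so several instances can hit the same url in parallel.
// needs: using System.Net;
public class RangedWebClient : WebClient
{
    private readonly long from;
    private readonly long to;

    public RangedWebClient(long from, long to)
    {
        this.from = from;
        this.to = to;
    }

    protected override WebRequest GetWebRequest(Uri address)
    {
        var httpWebRequest = base.GetWebRequest(address) as HttpWebRequest;
        if (httpWebRequest != null)
        {
            // request only this instance's slice of the file
            httpWebRequest.AddRange(from, to);
        }
        return httpWebRequest;
    }
}

So that, for example, new RangedWebClient(0, 999).DownloadData(url) would fetch only the first 1000 bytes.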
The last piece of the problem is that I need to "put the puzzle back together"... yet another thing I'll need to figure out...
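As far as I can tell, that part might be as simple as writing each finished chunk at its own offset in the target file (again, just a sketch of mine, not tested):

// Sketch: write one downloaded chunk straight into its slot in the target
// file, so no separate in-memory concatenation step is needed.
// needs: using System.IO;
public static void WritePartAtOffset(string destinationPath, long offset, byte[] part)
{
    using (var output = new FileStream(destinationPath, FileMode.OpenOrCreate,
                                       FileAccess.Write, FileShare.Write))
    {
        output.Seek(offset, SeekOrigin.Begin);
        output.Write(part, 0, part.Length);
    }
}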
As you can see in this code, the one part I haven't started on yet is the data parsing/processing, which I found very easy to do with HtmlAgilityPack,
so no problem there.
Current code
Main entry:
var urlList = new urlsForExtraction().urlsConcrDict();
var htmlDictionary = new ConcurrentDictionary<string, string>();
Parallel.ForEach(
    urlList.Values,
    new ParallelOptions { MaxDegreeOfParallelism = 20 },
    url => Download(url, htmlDictionary)
);
foreach (var pair in htmlDictionary)
{
    //Process(pair);
    MessageBox.Show(pair.Value);
}
public class urlsForExtraction
{
    const string URL_Dollar = "";
    const string URL_UpdateUsersTimeOut = "";

    public ConcurrentDictionary<string, string> urlsConcrDict()
    {
        // need to find the syntax to extract the field names, so it would be possible to iterate on each instead of specifying them
        ConcurrentDictionary<string, string> retDict = new ConcurrentDictionary<string, string>();
        retDict.TryAdd("URL_Dollar", "Any.Url.com");
        retDict.TryAdd("URL_UpdateUserstbl", "http://bing.com");
        return retDict;
    }
}
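About the comment inside urlsConcrDict: enumerating the const string fields instead of hard-coding every TryAdd should be possible with reflection, along these lines (a sketch; urlsConcrDictViaReflection is a hypothetical name):

// Sketch: collect every const string field of the class via reflection,
// so a new url only needs a new const, not another TryAdd line.
// needs: using System.Linq; using System.Reflection;
public ConcurrentDictionary<string, string> urlsConcrDictViaReflection()
{
    var retDict = new ConcurrentDictionary<string, string>();
    var constFields = typeof(urlsForExtraction)
        .GetFields(BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Static)
        .Where(f => f.IsLiteral && f.FieldType == typeof(string));
    foreach (var field in constFields)
    {
        // field.Name is e.g. "URL_Dollar"; the value is the const's content
        retDict.TryAdd(field.Name, (string)field.GetRawConstantValue());
    }
    return retDict;
}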
/// <summary>
/// Second-stage class: consumes the dictionary of urls for extraction,
/// then downloads each via Parallel.ForEach using the SmartWebClient (Download();)
/// </summary>
public class InitConcurentHtmDictExtrct
{
private void Download(string url, ConcurrentDictionary<string, string> htmlDictionary)
{
using (var webClient = new SmartWebClient())
{
webClient.Encoding = Encoding.GetEncoding("UTF-8");
webClient.Proxy = null;
htmlDictionary.TryAdd(url, webClient.DownloadString(url));
}
}
private ConcurrentDictionary<string, string> htmlDictionary;
public ConcurrentDictionary<string, string> LoopOnUrlsVia_SmartWC(Dictionary<string, string> urlList)
{
htmlDictionary = new ConcurrentDictionary<string, string>();
Parallel.ForEach(
urlList.Values,
new ParallelOptions { MaxDegreeOfParallelism = 20 },
url => Download(url, htmlDictionary)
);
return htmlDictionary;
}
}
/// <summary>
/// The extraction process, done via HtmlAgilityPack:
/// easy to use for collecting information within a given html document by referencing element attributes
/// </summary>
public class Results
{
public struct ExtracionParameters
{
public string FileNameToSave;
public string directoryPath;
public string htmlElementType;
}
public enum Extraction
{
ById, ByClassName, ByElementName
}
public void ExtractHtmlDict(ConcurrentDictionary<string, string> htmlResults, Extraction By)
{
// helps with easy elements extraction from the page.
HtmlAttribute htAgPcAttrbs;
HtmlDocument HtmlAgPCDoc = new HtmlDocument();
/// will hold the name+content of each document part that was eventually extracted;
/// from this container, building the result page will then be possible
Dictionary<string, HtmlDocument> dictResults = new Dictionary<string, HtmlDocument>();
foreach (KeyValuePair<string, string> htmlPair in htmlResults)
{
Process(htmlPair);
}
}
private static void Process(KeyValuePair<string, string> pair)
{
// do the html processing
}
}
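For completeness, I expect the Process placeholder to end up looking something like this with HtmlAgilityPack (the element names and XPath here are made-up examples):

// Sketch of the html processing step with HtmlAgilityPack: load the
// downloaded html and pull nodes by id / element name / class name.
// needs: using HtmlAgilityPack;
private static void Process(KeyValuePair<string, string> pair)
{
    var doc = new HtmlDocument();
    doc.LoadHtml(pair.Value); // pair.Value holds the raw html of pair.Key's url

    // ById: HtmlAgilityPack has a direct helper for this case
    HtmlNode byId = doc.GetElementbyId("content");

    // ByElementName / ByClassName: XPath over the document
    // (SelectNodes returns null when nothing matches)
    var byElement = doc.DocumentNode.SelectNodes("//table");
    var byClass = doc.DocumentNode.SelectNodes("//div[@class='result']");
}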
public class SmartWebClient : WebClient
{
private readonly int maxConcurentConnectionCount;
public SmartWebClient(int maxConcurentConnectionCount = 20)
{
this.Proxy = null;
this.Encoding = Encoding.GetEncoding("UTF-8");
this.maxConcurentConnectionCount = maxConcurentConnectionCount;
}
protected override WebRequest GetWebRequest(Uri address)
{
var httpWebRequest = (HttpWebRequest)base.GetWebRequest(address);
if (httpWebRequest == null)
{
return null;
}
if (maxConcurentConnectionCount != 0)
{
httpWebRequest.ServicePoint.ConnectionLimit = maxConcurentConnectionCount;
}
return httpWebRequest;
}
}
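One thing I do know is relevant here: the .NET Framework throttles concurrent HTTP connections per host (the client default is 2), which is exactly why SmartWebClient raises ServicePoint.ConnectionLimit. With several requests aimed at the same url, the same can also be done process-wide:

// raise the process-wide default (2 connections per host for client apps
// in the .NET Framework) before any request is created
ServicePointManager.DefaultConnectionLimit = 20;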
This would let me take advantage of the good bandwidth; only I'm still far from the solution, and I would really appreciate any clue on where to begin.