2

我需要将电子邮件的发送记录与退回记录相匹配,以便确定邮件是否已送达。问题在于,我必须把退回限制在发送后 4 天之内,以避免把退回匹配到错误的发送记录上。发送记录分布在 30 天内。

// Load both event sets; the sizes in the trailing comments are the asker's examples.
LinkedList<event_data> sent = GetMyHugeListOfSends(); //for example 1M+ records
List<event_data> bounced = GetMyListOfBounces(); //for example 150k records

// Oldest bounce first, so Find() below returns the earliest bounce that fits a send.
bounced = bounced.OrderBy(o => o.event_date).ToList();

List<event_data> delivered = new List<event_data>();

foreach (event_data sentEmail in sent)
{
     // A bounce only counts when it is strictly after the send and strictly
     // before the end of the matching window. Computed once per send instead
     // of once per inspected bounce.
     DateTime windowEnd = sentEmail.event_date.AddDays(deliveredCalcDelayDays);

     // NOTE: this is still a linear scan of the bounce list for every send,
     // which is the hot spot the question is about.
     event_data bounce = bounced.Find(item =>
          string.Equals(item.email, sentEmail.email, StringComparison.OrdinalIgnoreCase)
          && item.event_date > sentEmail.event_date
          && item.event_date < windowEnd);

     if (bounce != null)
     {
          //there was a bounce! don't add a delivered record!
          //remove the bounce here (not in the "delivered" branch, where it is
          //always null): it only applies to one send!
          bounced.Remove(bounce);
          continue;
     }

     //if sent is not bounced, it's delivered
     delivered.Add(new event_data
     {
          sid = siteid,
          mlid = mlid,
          mid = mid,
          email = sentEmail.email,
          event_date = sentEmail.event_date,
          event_status = "Delivered",
          event_type = "Delivered",
          id = sentEmail.id,
          number = sentEmail.number,
          laststoretransaction = sentEmail.laststoretransaction
     });

     // No early exit when the bounce list runs empty: every remaining send
     // still has to be recorded as delivered.
}

所以我做了一些测试,它每秒处理大约 12 条发送记录。超过 100 万条记录,需要 25 多个小时来处理!

两个问题:

  1. 我怎样才能找出耗时最多的确切代码行?
  2. 我猜是查找退回记录的 lambda 表达式耗时最长,因为在我加入它之前速度要快得多。我怎样才能加快速度?

谢谢!

编辑

---想法---

  1. 我刚刚想到的一个想法是按日期对发送进行排序,就像我对退回进行排序一样,这样通过退回的搜索将更加有效,因为早期发送也可能会遇到早期退回。
  2. 我刚刚想到的另一个想法是并行运行几个这样的进程,尽管我不喜欢多线程这个简单的应用程序。
4

7 回答 7

4

我可以相当有把握地说,是的,耗时的正是你的 Find 调用。

看起来您确定 Find 方法只会返回 0 或 1 条记录(而不是列表)。在这种情况下,加快速度的方法是创建一个查找表(字典):不要用 List<event_data> 存放退回记录,而是创建一个 Dictionary<key, event_data>,这样您就可以通过键直接查找值,而不必调用 Find 遍历列表。

诀窍在于如何构造键(我对您的应用程序了解不多,无法提供具体帮助),但它基本上应与您在 Find 中使用的条件相同。

编辑。(添加一些伪代码)

/// <summary>
/// Pseudo-code driver: builds a dictionary of bounced emails keyed by address,
/// then classifies every sent email with a keyed lookup instead of a list scan.
/// </summary>
void Main()
{
    var hugeListOfEmails = GetHugeListOfEmails();
    var allBouncedEmails = GetAllBouncedEmails();

    // The local must not share the method's name, otherwise the call on the
    // right-hand side binds to the (not yet assigned) local and fails to compile.
    IDictionary<string, EmailInfo> bouncedLookup = CreateLookupOfBouncedEmails(allBouncedEmails);

    foreach (var info in hugeListOfEmails)
    {
        if (bouncedLookup.ContainsKey(info.emailAddress))
        {
            // Email is bounced;
        }
        else
        {
            // Email is not bounced
        }
    }
}

/// <summary>
/// Placeholder for the source of sent emails; produces an empty sequence.
/// </summary>
/// <returns>A sequence containing no elements.</returns>
public IEnumerable<EmailInfo> GetHugeListOfEmails()
{
    return new EmailInfo[0];
}

/// <summary>
/// Placeholder for the source of bounced emails; produces an empty sequence.
/// </summary>
/// <returns>A sequence containing no elements.</returns>
public IEnumerable<EmailInfo> GetAllBouncedEmails()
{
    return new EmailInfo[0];
}

/// <summary>
/// Builds a lookup of bounced emails keyed by address, keeping the first entry
/// seen for each address that passes the optional date filter.
/// </summary>
/// <param name="emailList">Bounced emails to index.</param>
/// <param name="satisfiesDateConditions">
/// Optional filter deciding whether a bounce is eligible (for example, whether
/// it falls inside the accepted date range). When omitted, every bounce is eligible.
/// </param>
/// <returns>
/// A dictionary from email address to the first eligible bounce for that address.
/// Keys are compared case-insensitively.
/// </returns>
public IDictionary<string, EmailInfo> CreateLookupOfBouncedEmails(
    IEnumerable<EmailInfo> emailList,
    Func<EmailInfo, bool> satisfiesDateConditions = null)
{
    // Email addresses are matched without regard to case.
    var result = new Dictionary<string, EmailInfo>(StringComparer.OrdinalIgnoreCase);

    foreach (var e in emailList)
    {
        // Only the first eligible entry per address is kept.
        if (result.ContainsKey(e.emailAddress))
        {
            continue;
        }

        if (satisfiesDateConditions == null || satisfiesDateConditions(e))
        {
            result.Add(e.emailAddress, e);
        }
    }

    return result;
}

/// <summary>
/// Minimal email record used by the pseudo-code: an address and a date.
/// </summary>
public class EmailInfo
{
    /// <summary>Address of the email; used as the lookup key.</summary>
    public string emailAddress { get; set; }

    /// <summary>Date associated with the email.</summary>
    public DateTime DateSent { get; set; }
}
于 2013-05-23T02:44:09.927 回答
1

您应该通过使用ToLookup方法为电子邮件地址创建查找表来改进

// Index the bounces by lower-cased address so each send only has to inspect
// the bounces recorded for its own address.
var bouncedLookup = bounced.ToLookup(bounce => bounce.email.ToLower());

并在循环中使用它首先通过电子邮件查找

// Fetch only the bounces for this address; the lookup yields an empty
// sequence when the address has no bounces. The key is lower-cased the same
// way the lookup keys were built.
var filteredBounced = bouncedLookup[sentEmail.email.ToLower()];
// mini optimisation here: compute the end of the window once per send
var endDate = sentEmail.event_date.AddDays(deliveredCalcDelayDays);
// The lookup returns IEnumerable<event_data>, which has no Find(); use
// FirstOrDefault, which returns null when no bounce falls inside the window.
event_data bounce = filteredBounced.FirstOrDefault(item => item.event_date > sentEmail.event_date && item.event_date < endDate);

我无法编译它,但我认为应该这样做。请尝试一下。

于 2013-05-23T02:57:01.143 回答
0

将退回的转换为排序列表可能是一个很好的解决方案

// Group the bounces per address before building the sorted list: one address
// can bounce several times, and ToDictionary throws on duplicate keys.
// Addresses are compared case-insensitively; each group is ordered oldest first.
SortedList<string, List<event_data>> sl = new SortedList<string, List<event_data>>(
    bounced.GroupBy(s => s.email, StringComparer.OrdinalIgnoreCase)
           .ToDictionary(
               g => g.Key,
               g => g.OrderBy(b => b.event_date).ToList(),
               StringComparer.OrdinalIgnoreCase),
    StringComparer.OrdinalIgnoreCase);
// and to find a bounce use
List<event_data> candidates;
DateTime windowEnd = sentEmail.event_date.AddDays(deliveredCalcDelayDays);
event_data bounce = sl.TryGetValue(sentEmail.email, out candidates)
    ? candidates.FirstOrDefault(b => b.event_date > sentEmail.event_date && b.event_date < windowEnd)
    : null;
于 2013-05-23T07:20:13.253 回答
0

您正在列表中查找项目。这意味着它必须遍历整个列表,因此这是一个 O(n) 操作。您能否把已发送的电子邮件存储在字典中,以您要搜索的电子邮件地址作为键?然后遍历退回记录,通过键关联回字典中的电子邮件。每次查找都是常数时间,而您只需遍历一次退回记录,因此总体上是 O(n)。您当前的方法是 O(n²)。

于 2013-05-23T07:45:33.480 回答
0

考虑到,反弹的次数相对较少,所以,

为什么不尽可能地预先优化退回邮件查找,此代码为每个可能的退回邮件创建一个委托,并将它们分组到一个字典中,以便通过电子邮件键访问。

/// <summary>
/// Decides whether a bounce can belong to a send: the bounce must occur
/// strictly after the send and strictly before the end of the matching window.
/// </summary>
/// <param name="sendDate">Date the email was sent.</param>
/// <param name="bouncedDate">Date the bounce was recorded.</param>
/// <param name="deliveredCalcDelayDays">Length of the matching window, in days after the send.</param>
/// <returns>
/// <c>true</c> when <paramref name="bouncedDate"/> lies in the open interval
/// (sendDate, sendDate + deliveredCalcDelayDays); otherwise <c>false</c>.
/// </returns>
private static bool DateInRange(
    DateTime sendDate,
    DateTime bouncedDate,
    int deliveredCalcDelayDays)
{
    // A bounce at or before the send cannot have been caused by that send.
    if (bouncedDate <= sendDate)
    {
        return false;
    }

    return bouncedDate < sendDate.AddDays(deliveredCalcDelayDays);
}

/// <summary>
/// Lazily yields a "Delivered" record for every send that has no matching bounce.
/// Bounces are grouped per address (case-insensitively), and each bounce can
/// cancel at most one send.
/// </summary>
/// <param name="sent">Send events to classify.</param>
/// <param name="bounced">Bounce events to match against the sends.</param>
/// <param name="siteId">Value copied to <c>sid</c> on every delivered record.</param>
/// <param name="mlId">Value copied to <c>mlid</c> on every delivered record.</param>
/// <param name="mId">Value copied to <c>mid</c> on every delivered record.</param>
/// <param name="deliveredCalcDelayDays">Window length in days, passed through to <c>DateInRange</c>.</param>
/// <returns>One delivered record per send that was not matched to a bounce.</returns>
static IEnumerable<event_data> GetDeliveredMails(
           IEnumerable<event_data> sent,
           IEnumerable<event_data> bounced,
           int siteId,
           int mlId,
           int mId,
           int deliveredCalcDelayDays)
{
    // For each address: one predicate per bounce, ordered oldest bounce first,
    // so the earliest fitting bounce is the one consumed by a send.
    var lookup = bounced
        .GroupBy(b => b.email, StringComparer.OrdinalIgnoreCase)
        .ToDictionary(
            g => g.Key,
            g => g.OrderBy(e => e.event_date)
                  .Select(e => new Func<DateTime, bool>(
                      sentDate => DateInRange(sentDate, e.event_date, deliveredCalcDelayDays)))
                  .ToList(),
            StringComparer.OrdinalIgnoreCase);

    foreach (var s in sent)
    {
        List<Func<DateTime, bool>> checks;
        if (lookup.TryGetValue(s.email, out checks))
        {
            var match = checks.FirstOrDefault(c => c(s.event_date));
            if (match != null)
            {
                // A bounce applies to a single send: consume it and skip this send.
                checks.Remove(match);
                continue;
            }
        }

        yield return new event_data
            {
                sid = siteId,
                mlid = mlId,
                mid = mId,
                email = s.email,
                event_date = s.event_date,
                event_status = "Delivered",
                event_type = "Delivered",
                id = s.id,
                number = s.number,
                laststoretransaction = s.laststoretransaction
            };
    }
}

如果速度不够快,您可以尝试在查找中预编译委托。

于 2013-05-23T09:09:05.147 回答
0

我想指出您的代码还有另一个问题。

内存消耗。我不知道你的机器配置,但这里有一些关于代码的想法:

  1. 最初,您为 120 万以上个 event_data 类型的对象分配空间。我看不到 event_data 完整的类型定义,但假设电子邮件都是唯一的,并且看到该类型具有相当多的属性,我可以推测这样的集合相当大(可能有数百 MB)。
  2. 接下来,您将分配另一组event_data对象(如果我算对的话,几乎是 1M)。 它在内存消耗方面变得更加沉重
  3. 我不知道您的应用程序的数据模型中还存在哪些其他对象,但考虑到我提到的所有内容,您很容易接近 32 位进程的内存限制,从而迫使 GC 频繁工作。事实上,每次调用 bounced.Remove(bounce); 之后都很容易触发一次 GC,这确实会显著降低您的应用程序的速度。

因此,即使您有足够的内存和/或您的应用程序是 64 位的,我也会尽量减少内存消耗。我很确定这会让您的代码运行得更快。例如,您可以在不存储 deliveredEmail 的情况下对其进行完整处理,或者以分块的形式加载初始的 event_data 值等。

于 2013-05-23T09:55:52.387 回答
0

好的,我找到的最终解决方案是反弹字典。

发送的 LinkedList 按 sent_date 排序,因此它会按时间顺序循环。这很重要,因为我必须将正确的发送与正确的反弹相匹配。

我创建了一个 Dictionary<string, List<event_data>>,其中键是电子邮件地址,值是该电子邮件地址的所有退回记录组成的 List<event_data>。该列表按 event_date 排序,因为我想确保最早的退回与发送相匹配。

最终结果......它从处理 700 条记录/分钟变为 500k+ 条记录/秒。

这是最终代码:

// A bounce counts against a send only when it arrives within this many days after it.
const int bounceWindowDays = 4;

// Sends are processed oldest first so each bounce is paired with the earliest send it fits.
LinkedList<event_data> sent = GetMyHugeListOfSends();
IEnumerable<event_data> sentOrdered = sent.OrderBy(send => send.event_date);

// Bounces grouped per email address.
// NOTE(review): the text above says each list is sorted by event_date, and the
// lookup below is case-insensitive only if the dictionary was built with a
// case-insensitive comparer. Confirm both in GetMyListOfBouncesAsDictionary.
Dictionary<string, List<event_data>> bounced = GetMyListOfBouncesAsDictionary();

List<event_data> delivered = new List<event_data>();

foreach (event_data sentEmail in sentOrdered)
{
    bool matchedBounce = false;

    List<event_data> bounces;
    if (bounced.TryGetValue(sentEmail.email, out bounces))
    {
        //there was a bounce! find out if it was within 4 days after the send!
        DateTime windowEnd = sentEmail.event_date.AddDays(bounceWindowDays);
        int matchIndex = bounces.FindIndex(bounce =>
            bounce.event_date > sentEmail.event_date &&
            bounce.event_date <= windowEnd);

        if (matchIndex >= 0)
        {
            matchedBounce = true;

            //remove the record because a bounce can only match once back to a send
            bounces.RemoveAt(matchIndex);

            if (bounces.Count == 0) //no more bounces for this email
            {
                bounced.Remove(sentEmail.email);
            }
        }
    }

    if (matchedBounce)
    {
        continue;
    }

    //if sent is not bounced, it's delivered
    delivered.Add(new event_data
    {
        sid = siteid,
        mlid = mlid,
        mid = mid,
        email = sentEmail.email,
        event_date = sentEmail.event_date,
        event_status = "Delivered",
        event_type = "Delivered",
        id = sentEmail.id,
        number = sentEmail.number,
        laststoretransaction = sentEmail.laststoretransaction
    });

    // No early exit when the bounce dictionary runs empty: every remaining
    // send still has to be recorded as delivered.
}
于 2013-05-23T22:25:50.520 回答