请看下面的代码。正如文档所建议的,我已经关闭了 auto-down-unreachable-after 功能,改为实现一套与默认行为略有不同的自定义逻辑。下面代码的关键在于:如果发生网络分区,只有占多数的那一侧集群节点才会在一段可配置的延迟(这里是 5 秒)之后将 UnreachableMember 下线(down)。另一方面,占少数的那一侧节点只会把它们的 UnreachableMember(也就是多数派)视为 unreachable,而不会将其下线,从而避免自己形成一个孤岛。多数派的思路借鉴自 MongoDB,我认为这在计算机科学领域并不新鲜。
/**
 * Cluster listener that downs unreachable members only while this node
 * can still see a strict majority of the cluster.
 *
 * When a member is reported unreachable and this node currently sees a
 * majority, the member is recorded and a `"die"` message is scheduled to
 * `self` after [[takeDownDelay]]. When `"die"` is handled, the majority
 * check is repeated against the current cluster state before any member
 * is downed. Failure detection of several peers is usually staggered, so
 * a node on the minority side may still look like a majority when the
 * first peers are reported; re-checking at downing time prevents that
 * node from downing the real majority.
 *
 * NOTE(review): the timer is shared by all pending members, so a member
 * recorded shortly before an earlier timer fires gets a shorter grace
 * period than `takeDownDelay`.
 */
class ClusterListener extends Actor with ActorLogging {

  val cluster = Cluster(context.system)

  /** Unreachable members that are waiting to be downed. */
  var unreachableMember: Set[Member] = Set()

  /**
   * Delay between recording an unreachable member and attempting to down it.
   * Defaults to 5 seconds; override in a subclass to configure it.
   */
  protected def takeDownDelay: scala.concurrent.duration.FiniteDuration = 5.seconds

  // subscribe to cluster changes, re-subscribe when restart
  override def preStart(): Unit = {
    cluster.subscribe(
      self,
      initialStateMode = InitialStateAsEvents,
      classOf[UnreachableMember],
      classOf[ReachableMember]
    )
  }

  override def postStop(): Unit = cluster.unsubscribe(self)

  def receive: Receive = {
    case UnreachableMember(member) =>
      log.info("Member detected as unreachable: {}", member)
      if (seesMajority) {
        scheduletakeDown(member)
      }

    case ReachableMember(member) =>
      // The member came back: it must no longer be downed.
      unreachableMember = unreachableMember - member

    case _: MemberEvent => // ignore

    case "die" =>
      if (unreachableMember.nonEmpty) {
        if (seesMajority) {
          unreachableMember.foreach { member =>
            cluster.down(member.address)
          }
          // Forget downed members so later "die" messages do not down them again
          // and the set does not grow without bound.
          unreachableMember = Set()
        } else {
          // Quorum was lost since the members were recorded: keep them pending
          // and check again later instead of downing from a minority.
          log.info("No majority visible, postponing downing of {} member(s)", unreachableMember.size)
          armTimer()
        }
      }
  }

  /** True when the current cluster state shows a strict majority of members as reachable. */
  private def seesMajority: Boolean = {
    val state = cluster.state
    isMajority(state.members.size, state.unreachable.size)
  }

  /** Smallest number of nodes that forms a strict majority of a group of `n`. */
  private def majority(n: Int): Int = n / 2 + 1

  /**
   * @param total number of members in the cluster
   * @param dead  number of those members that are unreachable
   * @return true when the remaining members form a strict majority; false for
   *         an empty or inconsistent view, since no quorum can be claimed then
   */
  private def isMajority(total: Int, dead: Int): Boolean =
    total > 0 && dead >= 0 && (total - dead) >= majority(total)

  /** Records `member` as pending and arms the timer that triggers the downing attempt. */
  private def scheduletakeDown(member: Member) = {
    unreachableMember = unreachableMember + member
    armTimer()
  }

  /** Schedules a single `"die"` message to this actor after [[takeDownDelay]]. */
  private def armTimer() = {
    implicit val dispatcher: scala.concurrent.ExecutionContext = context.system.dispatcher
    context.system.scheduler.scheduleOnce(takeDownDelay, self, "die")
  }
}