我在安装了 com.amazon.deequ:deequ:2.0.0-spark-3.1 库的 Databricks Notebook 上运行以下命令来检查输入数据的数据质量,但在调用某些检查函数时收到编译错误,提示 isGreaterThanOrEqualTo、hasDataType、hasMinLength 等不是 VerificationRunBuilder(验证运行生成器)的成员。这些检查到底定义在哪里?我确实查看了 https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/checks/Check.scala,它们确实在 Check 类中存在。
%scala
import com.amazon.deequ.{VerificationResult, VerificationSuite}
import com.amazon.deequ.VerificationResult.checkResultsAsDataFrame
import com.amazon.deequ.checks.{Check, CheckLevel, CheckStatus}
import com.amazon.deequ.constraints.{ConstrainableDataTypes, Constraint}
import com.amazon.deequ.suggestions.{ConstraintSuggestionRunner, Rules}
// Run a Deequ verification over `df` and collect the outcome of every constraint.
//
// BUG FIX: in the original code the parenthesis after `.isNonNegative("prem_amt")`
// closed the `addCheck(...)` call too early, so the remaining `.hasMinLength`,
// `.hasMaxLength`, `.hasDataType` and `.isGreaterThanOrEqualTo` calls were applied
// to the `VerificationRunBuilder` returned by `addCheck` — which does not define
// them — producing the "is not a member of VerificationRunBuilder" error. All of
// these methods live on `Check`, so the whole chain must stay inside `addCheck`.
// `ConstrainableDataTypes` also needs to be imported from
// `com.amazon.deequ.constraints` for `hasDataType` to compile.
val verificationResult: VerificationResult = {
  VerificationSuite()
    // data to run the verification on
    .onData(df)
    // define a data quality check: every constraint is chained on the Check itself
    .addCheck(
      Check(CheckLevel.Error, "unitTest")
        //.hasSize(_ >= 2)                       // at least 2 rows
        .hasMax("prem_amt", _ <= 2000)           // max premium must not exceed 2000
        .hasMin("prem_amt", _ >= 1000)           // min premium must be at least 1000
        //.hasCompleteness("pol_nbr", _ >= 0.95) // 95%+ non-null policy numbers
        .isNonNegative("prem_amt")               // should not contain negative values
        .hasMinLength("pol_nbr", _ <= 8)         // shortest policy number is at most 8 chars
        .hasMaxLength("pol_nbr", _ <= 8)         // longest policy number is at most 8 chars
        .hasDataType("trans_eff_dt", ConstrainableDataTypes.Date)
        .isGreaterThanOrEqualTo("trans_eff_dt", "pol_eff_dt"))
    // compute metrics and verify check conditions
    .run()
}
// Per-constraint pass/fail results, rendered as a Spark DataFrame.
val resultDataFrame = checkResultsAsDataFrame(spark, verificationResult)
resultDataFrame.show(truncate = false)

// Raw metric values computed while running the verification.
val metricsDataFrame = VerificationResult.successMetricsAsDataFrame(spark, verificationResult)
metricsDataFrame.show(truncate = false)