1

我在带有 com.amazon.deequ:deequ:2.0.0-spark-3.1 库的 Databricks Notebook 上运行以下命令,以检查输入数据的数据质量,但在 com.amazon.deequ.VerificationRunBuilder 的某些成员函数上收到了"不是其成员"的错误消息。isGreaterThanOrEqualTo、hasDataType、hasMinLength 等检查方法到底定义在哪里?我确实查看了 https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/checks/Check.scala,这些方法确实在 Check 类中存在。

%scala

import com.amazon.deequ.{VerificationResult, VerificationSuite}
import com.amazon.deequ.VerificationResult.checkResultsAsDataFrame
import com.amazon.deequ.checks.{Check, CheckLevel, CheckStatus}
import com.amazon.deequ.constraints.{ConstrainableDataTypes, Constraint}
import com.amazon.deequ.suggestions.{ConstraintSuggestionRunner, Rules}

// Runs a Deequ verification suite against `df` and captures the result.
//
// Root cause of the reported "value hasMinLength is not a member of
// VerificationRunBuilder" errors: in the original code the extra closing
// parenthesis after `.isNonNegative("prem_amt"))` ended the `Check(...)`
// argument of `.addCheck`, so the subsequent `.hasMinLength`, `.hasMaxLength`,
// `.hasDataType` and `.isGreaterThanOrEqualTo` calls were made on the
// VerificationRunBuilder returned by `.addCheck(...)` — which does not define
// them. All constraint methods live on `Check`, so the entire chain must stay
// inside the `addCheck(...)` argument. `ConstrainableDataTypes` also needs an
// explicit import from com.amazon.deequ.constraints.
val verificationResult: VerificationResult = { VerificationSuite()
  // data to run the verification on
  .onData(df)
  // define a data quality check; every constraint below is a Check method,
  // so addCheck's closing parenthesis must come after the LAST constraint
  .addCheck(
    Check(CheckLevel.Error, "unitTest")
      //.hasSize(_ >= 2) // at least 2 rows
      .hasMax("prem_amt", _ <= 2000) // max premium must not exceed 2000
      .hasMin("prem_amt", _ >= 1000) // min premium must be at least 1000
      //.hasCompleteness("pol_nbr", _ >= 0.95) // 95%+ non-null policy numbers
      .isNonNegative("prem_amt") // should not contain negative values
      .hasMinLength("pol_nbr", _ >= 8) // minimum length is 8 (was `_ <= 8`, contradicting the stated intent)
      .hasMaxLength("pol_nbr", _ <= 8) // maximum length is 8
      .hasDataType("trans_eff_dt", ConstrainableDataTypes.Date)
      .isGreaterThanOrEqualTo("trans_eff_dt", "pol_eff_dt"))
  // compute metrics and verify check conditions
  .run()
}

// convert check results to a Spark data frame
// Render the outcome of every check constraint as a Spark DataFrame
// and print it without column truncation.
val resultDataFrame = checkResultsAsDataFrame(spark, verificationResult)
resultDataFrame.show(truncate = false)

// Also print the computed success metrics (one row per analyzer/metric).
VerificationResult
  .successMetricsAsDataFrame(spark, verificationResult)
  .show(truncate = false)
4

0 回答 0