如何配置环境,以便从 Jupyter 笔记本向 Spark/YARN(客户端模式)提交 PyDeequ 作业?除了针对 AWS 环境的用法说明之外,我没有找到全面的解释。在非 AWS 环境中应该如何设置环境?
TypeError: 'JavaPackage' object is not callable
如果只是按照示例(例如《使用 PyDeequ 大规模测试数据质量》一文)进行操作,就会出现上述错误。所用代码和完整的错误回溯如下:
from pydeequ.analyzers import *
analysisResult = AnalysisRunner(spark) \
.onData(df) \
.addAnalyzer(Size()) \
.addAnalyzer(Completeness("review_id")) \
.addAnalyzer(ApproxCountDistinct("review_id")) \
.addAnalyzer(Mean("star_rating")) \
.addAnalyzer(Compliance("top star_rating", "star_rating >= 4.0")) \
.addAnalyzer(Correlation("total_votes", "star_rating")) \
.addAnalyzer(Correlation("total_votes", "helpful_votes")) \
.run()
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
analysisResult_df.show()
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_499599/1388970492.py in <module>
1 from pydeequ.analyzers import *
----> 2 analysisResult = AnalysisRunner(spark) \
3 .onData(df) \
4 .addAnalyzer(Size()) \
5 .addAnalyzer(Completeness("review_id")) \
~/home/repository/git/oonisim/aws/venv/lib/python3.8/site-packages/pydeequ/analyzers.py in onData(self, df)
50 """
51 df = ensure_pyspark_df(self._spark_session, df)
---> 52 return AnalysisRunBuilder(self._spark_session, df)
53
54
~/home/repository/git/oonisim/aws/venv/lib/python3.8/site-packages/pydeequ/analyzers.py in __init__(self, spark_session, df)
122 self._jspark_session = spark_session._jsparkSession
123 self._df = df
--> 124 self._AnalysisRunBuilder = self._jvm.com.amazon.deequ.analyzers.runners.AnalysisRunBuilder(df._jdf)
125
126 def addAnalyzer(self, analyzer: _AnalyzerObject):
TypeError: 'JavaPackage' object is not callable