To meet my requirement of "processing the supplied data with a provided external library", I wrote a UDAF in Spark/Scala. It worked fine until I hit the scenario below:
TestWindowFunc.scala
import org.apache.spark.sql.SparkSession

object TestWindowFunc {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("TestWindowFunc")
      .master("local[3]")
      .config("spark.driver.memory", "5g")
      .getOrCreate()

    spark.udf.register("custAvg", new CustAvg)

    val df = spark.read.option("delimiter", "|").option("header", "true")
      .csv("./src/main/resources/students_mark.csv")

    df.createOrReplaceTempView("testWindowFunc")

    val df1 = spark.sql("select X.*" +
      ", custAvg(ACT_MARK, OUT_OF) over (partition by STUDENT_ID order by ACT_MARK) a" +
      ", custAvg(ACT_MARK, OUT_OF) over (partition by STUDENT_ID order by ACT_MARK) b" +
      " from testWindowFunc X")

    df1.show()
  }
}
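(Side note: each window clause above has an ORDER BY but no explicit frame, so my understanding is that Spark applies the default running frame, RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, which would explain evaluate firing once per row rather than once per partition. The query below is only my attempt to spell that assumption out; it is not what the original run used:)

    // Assumed-equivalent query with the default window frame written out.
    // (My assumption: with ORDER BY and no frame clause, the frame is
    // RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW.)
    val df2 = spark.sql("select X.*" +
      ", custAvg(ACT_MARK, OUT_OF) over (partition by STUDENT_ID order by ACT_MARK" +
      "   range between unbounded preceding and current row) a" +
      " from testWindowFunc X")
    df2.show()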
CustAvg.scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, IntegerType, LongType, StructField, StructType}

class CustAvg extends UserDefinedAggregateFunction {

  var initializeCounter = 0
  var updateCounter = 0

  override def inputSchema: StructType = StructType(Array(
    StructField("act_mark", IntegerType),
    StructField("out_of", IntegerType)
  ))

  override def bufferSchema: StructType = StructType(Array(
    StructField("act_mark_tot", LongType),
    StructField("out_of_tot", LongType)
  ))

  override def dataType: DataType = LongType

  override def deterministic: Boolean = false

  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    initializeCounter += 1
    println("initialize:::" + initializeCounter)
    updateCounter = 0
    /**
     * initializing the external library for each window
     */
    // uncomment the below lines to execute the function
    // buffer(0) = 0L
    // buffer(1) = 0L
  }

  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    updateCounter += 1
    println("update:::" + updateCounter)
    /**
     * sending data to the external library for each row of the respective window
     */
    // uncomment the below lines to execute the function
    // buffer(0) = buffer.getLong(0) + input.getInt(0)
    // buffer(1) = buffer.getLong(1) + input.getInt(1)
  }

  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    throw new Exception("Merge Not Allowed")
  }

  override def evaluate(buffer: Row): Any = {
    println("evaluate:::" + updateCounter)
    /**
     * calling the external library to process the data
     */
    // uncomment the below line to execute the function
    // buffer.getLong(0)
  }
}
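(Note that initializeCounter and updateCounter above are instance fields, so if Spark drives several aggregation buffers through one CustAvg instance, their increments would interleave in the trace. The sketch below is a hypothetical diagnostic variant, not part of my original code; it moves the update counter into the aggregation buffer so it is tracked per window. The class and field names are mine, for illustration only:)

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, IntegerType, LongType, StructField, StructType}

// Hypothetical diagnostic variant: the counter lives in the buffer,
// so it is per-window state rather than shared instance state.
class CustAvgBuffered extends UserDefinedAggregateFunction {

  override def inputSchema: StructType = StructType(Array(
    StructField("act_mark", IntegerType),
    StructField("out_of", IntegerType)
  ))

  override def bufferSchema: StructType = StructType(Array(
    StructField("act_mark_tot", LongType),
    StructField("out_of_tot", LongType),
    StructField("update_count", LongType) // per-buffer call counter
  ))

  override def dataType: DataType = LongType

  override def deterministic: Boolean = false

  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0L
    buffer(1) = 0L
    buffer(2) = 0L
  }

  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer(0) = buffer.getLong(0) + input.getInt(0)
    buffer(1) = buffer.getLong(1) + input.getInt(1)
    buffer(2) = buffer.getLong(2) + 1L
    // Counts per buffer: each window expression's buffer keeps its own tally.
    println("update:::" + buffer.getLong(2))
  }

  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit =
    throw new Exception("Merge Not Allowed")

  override def evaluate(buffer: Row): Any = {
    println("evaluate:::" + buffer.getLong(2))
    buffer.getLong(0)
  }
}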
students_mark.csv
STUDENT_ID|ACT_MARK|OUT_OF
1|70|100
1|68|100
1|90|100
Expected output
initialize:::1
update:::1
evaluate:::1
update:::2
evaluate:::2
update:::3
evaluate:::3
initialize:::2
update:::1
evaluate:::1
update:::2
evaluate:::2
update:::3
evaluate:::3
Actual output
initialize:::1
initialize:::2
update:::1
update:::2
evaluate:::2
evaluate:::2
update:::3
update:::4
evaluate:::4
evaluate:::4
update:::5
update:::6
evaluate:::6
evaluate:::6
Is this how Spark behaves in this scenario, or am I doing something wrong here?
Could someone help me with an explanation of what is actually happening?
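(In case it helps, the physical plan can be dumped to check whether the two identical window expressions end up planned into a single Window operator; this is just a diagnostic suggestion, not output I have captured:)

    // Print the parsed, analyzed, optimized and physical plans for df1.
    // If both custAvg calls appear in one Window operator, their buffers
    // are filled during the same pass over each partition.
    df1.explain(true)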
Version details:
- Scala: 2.11
- Spark: 2.4.0
Thanks in advance.