Spark SQL 中,SELECT 列表里的 AS 别名可以是相关列子查询(correlated scalar subquery)吗?
原因:java.lang.RuntimeException:在 [storeid#4,combox_pid#6,pid#7,count(1)#61L] 中找不到 count(DISTINCT orderid)#69L
select b.pid, (select count(distinct orderid) from a where a.pid=b.pid) as order_num,count(1) from b as b group by b.pid
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SparkSession}
/**
 * Demo: Spark SQL rejects a correlated scalar subquery that computes
 * `count(distinct ...)` in the SELECT list of an aggregate query (it fails at
 * runtime with "couldn't find count(DISTINCT orderid)#...L in [...]" — see the
 * note above this object). The workaround shown here is to pre-aggregate into
 * a temp view first, then run the correlated subquery against that view.
 */
object OrderCountTset {
  // Quiet Spark's default chatty logging for the demo output.
  Logger.getRootLogger.setLevel(Level.WARN)

  def main(args: Array[String]): Unit = {
    // Sample rows: (storeid, orderid, combox_pid, pid).
    val data = Seq(
      Row("a", "100", "200", "300"),
      Row("a", "100", "200", "300"),
      Row("a", "101", "201", "300"),
      Row("a", "101", "200", "3001"),
      Row("a", "102", "200", "300"),
      Row("a", "103", "201", "300")
    )
    val schema = new StructType()
      .add("storeid", StringType)
      .add("orderid", StringType)
      .add("combox_pid", StringType)
      .add("pid", StringType)

    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    try {
      val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
      df.show()
      df.createOrReplaceTempView("tab_tmp")

      // Baseline: count(distinct orderid) works fine as a plain aggregate
      // (here it is always 1 because orderid is itself a grouping key).
      spark.sql("select storeid,orderid,combox_pid,pid,count(distinct orderid ),count(1) from tab_tmp group by storeid,orderid,combox_pid,pid").show()

      // This variant FAILS at runtime — a correlated scalar subquery with
      // count(distinct) in the SELECT list of a GROUP BY query triggers:
      //   java.lang.RuntimeException: couldn't find count(DISTINCT orderid)#69L in [...]
      // spark.sql(
      //   """
      //     |select b.storeid,b.combox_pid,b.pid,
      //     |(select count(distinct a.orderid) from tab_tmp as a where a.pid=b.pid) as order_num,
      //     |count(1)
      //     | from tab_tmp as b group by b.storeid,b.combox_pid,b.pid
      //   """.stripMargin).show()

      // Workaround step 1: materialize the GROUP BY aggregation as its own view.
      spark.sql(
        """
          |select storeid,combox_pid,pid,count(1) as num
          | from tab_tmp group by storeid,combox_pid,pid
        """.stripMargin).createOrReplaceTempView("tab_tmp_2")

      // Workaround step 2: the correlated scalar subquery now runs against the
      // pre-aggregated view, where no conflicting aggregation remains.
      spark.sql(
        """
          |select b.storeid,b.combox_pid,b.pid,num,
          |(select count(distinct a.orderid) from tab_tmp as a where a.pid=b.pid) as order_num
          | from tab_tmp_2 as b
        """.stripMargin).show()
    } finally {
      // Release the local Spark context even if a query throws.
      spark.stop()
    }
  }
}