做检查na.drop
函数data-frame
,你可以删除基于空值的行,一行中的最小空值,也可以基于具有空值的特定列。
scala> sc.parallelize(Seq((1,"a","a"),(1,"a","a"),(2,"b","b"),(3,"c","c"),(4,"d","d"),(4,"d",null))).toDF
res7: org.apache.spark.sql.DataFrame = [_1: int, _2: string ... 1 more field]
scala> res7.show()
+---+---+----+
| _1| _2| _3|
+---+---+----+
| 1| a| a|
| 1| a| a|
| 2| b| b|
| 3| c| c|
| 4| d| d|
| 4| d|null|
+---+---+----+
//dropping row if a null is found
scala> res7.na.drop.show()
+---+---+---+
| _1| _2| _3|
+---+---+---+
| 1| a| a|
| 1| a| a|
| 2| b| b|
| 3| c| c|
| 4| d| d|
+---+---+---+
//drops only if `minNonNulls = 3` if accepted to each row
scala> res7.na.drop(minNonNulls = 3).show()
+---+---+---+
| _1| _2| _3|
+---+---+---+
| 1| a| a|
| 1| a| a|
| 2| b| b|
| 3| c| c|
| 4| d| d|
+---+---+---+
//not dropping any
scala> res7.na.drop(minNonNulls = 2).show()
+---+---+----+
| _1| _2| _3|
+---+---+----+
| 1| a| a|
| 1| a| a|
| 2| b| b|
| 3| c| c|
| 4| d| d|
| 4| d|null|
+---+---+----+
//drops row based on nulls in `_3` column
scala> res7.na.drop(Seq("_3")).show()
+---+---+---+
| _1| _2| _3|
+---+---+---+
| 1| a| a|
| 1| a| a|
| 2| b| b|
| 3| c| c|
| 4| d| d|
+---+---+---+