我可以使用 StringIndexer(字符串索引器)和 OneHotEncoder(独热编码器)生成最右侧的 features 列。注意 id 1 对应多行。我想知道如何通过 Pipeline 或其他替代方法,将 features 列中的稀疏向量按 id 聚合,使 id 1 的特征为 (7,[0,3,5],[1.0,1.0,1.0])。
我想接受这个输入:
+---+------+----+-----+
| id|houses|cars|label|
+---+------+----+-----+
| 0| M| A| 1.0|
| 1| M| C| 1.0|
| 1| M| B| 1.0|
| 2| F| A| 0.0|
| 3| F| D| 0.0|
| 4| Z| B| 1.0|
| 5| Z| C| 0.0|
+---+------+----+-----+
然后对 houses 列和 cars 列分别进行独热编码,将编码结果组合成一个向量,并按 id 聚合,
并生成此输出:
+-------------------+
| features|
+-------------------+
|(7,[0,4],[1.0,1.0])|
|(7,[0,3,5],[1.0,1.0,1.0])|
|(7,[2,4],[1.0,1.0])|
|(7,[2,6],[1.0,1.0])|
|(7,[1,3],[1.0,1.0])|
|(7,[1,5],[1.0,1.0])|
+-------------------+
/**
 * Builds a small sample DataFrame and fits a pipeline on it that string-indexes
 * and one-hot encodes the `houses` and `cars` columns, assembles both encodings
 * into a single `features` vector and trains a logistic regression.
 *
 * The fitted model is not returned; the method only prints the input data.
 *
 * @param sqlContext context used to create the sample DataFrame
 */
def oneHotEncoderExample(sqlContext: SQLContext): Unit = {
  // Sample rows as (id, houses, cars, label); id 1 intentionally appears twice.
  val rows = Seq(
    (0, "M", "A", 1.0),
    (1, "M", "C", 1.0),
    (1, "M", "B", 1.0),
    (2, "F", "A", 0.0),
    (3, "F", "D", 0.0),
    (4, "Z", "B", 1.0),
    (5, "Z", "C", 0.0)
  )
  val df = sqlContext.createDataFrame(rows).toDF("id", "houses", "cars", "label")
  df.show()

  // Local factories for the two kinds of stage that are built twice.
  def indexerFor(in: String, out: String) =
    new StringIndexer().setInputCol(in).setOutputCol(out)

  // dropLast is disabled so that every category keeps its own slot in the vector.
  def encoderFor(in: String, out: String) =
    new OneHotEncoder().setDropLast(false).setInputCol(in).setOutputCol(out)

  // Feature stages for the `houses` column.
  val indexerHouse = indexerFor("houses", "housesIndex")
  val encoderHouse = encoderFor("housesIndex", "typeHouses")

  // Feature stages for the `cars` column.
  val indexerCar = indexerFor("cars", "carsIndex")
  val encoderCar = encoderFor("carsIndex", "typeCars")

  // Concatenate both one-hot vectors into the `features` column.
  val assembler = new VectorAssembler()
    .setInputCols(Array("typeHouses", "typeCars"))
    .setOutputCol("features")

  val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)

  // Stages run in this order: index/encode houses, index/encode cars, assemble, classify.
  val pipeline = new Pipeline().setStages(
    Array(indexerHouse, encoderHouse, indexerCar, encoderCar, assembler, lr))

  // Fit every stage of the pipeline on the sample data.
  val pipelineModel = pipeline.fit(df)
}
// Helper code that replays the feature stages of the pipeline one by one and
// prints each intermediate DataFrame. It expects `df`, `indexerHouse`,
// `encoderHouse`, `indexerCar`, `encoderCar` and `assembler` to be in scope.
// It stops at the assembled per-row vectors: rows are not merged per id here.
val indexedHouse = indexerHouse.fit(df).transform(df)
indexedHouse.show()
val encodedHouse = encoderHouse.transform(indexedHouse)
encodedHouse.show()
val indexedCar = indexerCar.fit(df).transform(df)
indexedCar.show()
val encodedCar = encoderCar.transform(indexedCar)
encodedCar.show()
// Both sides of the join carry a `label` column that is not a join key. Drop it
// from the right side so the result has exactly one `label` column; otherwise
// any later reference to `label` by name would be ambiguous.
val assembledFeature = assembler.transform(
  encodedHouse.join(encodedCar.drop("label"), usingColumns = Seq("id", "houses", "cars")))
assembledFeature.show()