下面是我的 RandomForest 多类分类模型的代码。我正在读取 CSV 文件并进行代码中所见的各种转换。
我先计算所有特征列中最大的类别数(即单列中不同取值的个数),然后将其作为 maxBins 参数传给 RF。这一步非常耗时!是否有可以设置的参数,或者有更简单的方法让模型自动推断最大类别数?类别数可能超过 1000 个,而且我不能丢弃其中任何类别。
在对新数据进行预测时,我该如何处理训练时未见过的标签(unseen labels)?在这种情况下 StringIndexer 会失败。下面的代码只使用了部分数据,但将来我还会引入新数据。
// Trains a multiclass RandomForest on a CSV file whose feature columns are all
// treated as categorical: every feature column is string-indexed, the indexed
// columns are assembled into one vector, and the forest predicts Label1.

// Indexed label columns the model is meant to predict.
val cols_to_predict = Array("Label1", "Label2")
// ID column; it must never be used as a feature.
val omit_cols = Array("Key")
// Raw label columns read from `data` by the label indexers below. They are the
// source of cols_to_predict, so leaving them in the feature set leaks the
// answer into the model.
val labelSourceCols = Array("orig_label1", "prog_label2")

val Array(label1Col, label2Col) = cols_to_predict
val Array(label1SourceCol, label2SourceCol) = labelSourceCols

// Read the CSV file.
val data = sqlContext.read
  .format("com.databricks.spark.csv")
  .option("header", "true")      // use first line of all files as header
  .option("inferSchema", "true") // automatically infer data types
  .load("abc.csv")
  .cache()

// Feature frame: everything except the labels (raw and indexed) and the ID.
// DataFrame.drop is a no-op for a column that does not exist, so listing both
// label spellings is safe.
val nonFeatureCols = cols_to_predict ++ omit_cols ++ labelSourceCols
val features = nonFeatureCols.foldLeft(data)((df, cname) => df.drop(cname))

// One fitted StringIndexer per feature column.
// handleInvalid = "skip" makes transform() drop rows that contain a value not
// seen at fit time instead of throwing, which is what happens by default when
// new data carries unseen categories.
val featureIndexers = features.columns.map { cname =>
  new StringIndexer()
    .setInputCol(cname)
    .setOutputCol(s"${cname}_index")
    .setHandleInvalid("skip")
    .fit(features)
}
val transformers: Array[org.apache.spark.ml.PipelineStage] =
  featureIndexers.map(indexer => indexer: org.apache.spark.ml.PipelineStage)

// Largest number of categories in any feature column, needed for maxBins.
// Each fitted indexer already holds its column's distinct values in `labels`,
// so no extra distinct().count() job per column is required. The fold starts at
// 2 because maxBins must be at least 2; it also avoids `.max` failing when
// there are no feature columns.
val distinct_col_counts: Long =
  featureIndexers.map(_.labels.length).foldLeft(2)(_ max _).toLong

val assembler = new VectorAssembler()
  .setInputCols(features.columns.map(cname => s"${cname}_index"))
  .setOutputCol("features")

// Not part of the pipeline yet (only Label1 is trained for now), so the fit is
// deferred until something actually uses it.
lazy val labelIndexer2 = new StringIndexer()
  .setInputCol(label2SourceCol)
  .setOutputCol(label2Col)
  .fit(data)

// Fitted eagerly because labelConverter needs its label list.
val labelIndexer1 = new StringIndexer()
  .setInputCol(label1SourceCol)
  .setOutputCol(label1Col)
  .fit(data)

val rf = new RandomForestClassifier()
  .setLabelCol(label1Col)
  .setFeaturesCol("features")
  .setNumTrees(100)
  .setMaxBins(distinct_col_counts.toInt)

// Maps the numeric prediction back to the original label string.
val labelConverter = new IndexToString()
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")
  .setLabels(labelIndexer1.labels)

// Split into train and test.
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
trainingData.cache()
testData.cache()

// Running only for one label for now (Label1); append labelIndexer2 and a
// second classifier to cover Label2.
val stages: Array[org.apache.spark.ml.PipelineStage] =
  transformers :+ labelIndexer1 :+ assembler :+ rf :+ labelConverter

val pipeline = new Pipeline().setStages(stages)
val model = pipeline.fit(trainingData)
val predictions = model.transform(testData)