1

我是初学者Scala

我正在尝试创建一个接受 aProbabilisticClassifier作为输入并产生一个CrossValidator模型作为输出的对象:

import org.apache.spark.ml.classification.{ProbabilisticClassifier, ProbabilisticClassificationModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

import constants.Const

object MyModels {

  def loadOrCreateModel[A, M, T](
    model: ProbabilisticClassifier[Vector[T], A, M],
    paramGrid: Array[ParamMap]): CrossValidator = {

    // Binary evaluator.
    val binEvaluator = (
      new BinaryClassificationEvaluator()
        .setLabelCol("yCol")
      )

    // Cross validator.
    val cvModel = (
      new CrossValidator()
        .setEstimator(model)
        .setEvaluator(binEvaluator)
        .setEstimatorParamMaps(paramGrid)
        .setNumFolds(3)
      )
    cvModel
  }
}

但这给了我:

sbt package
[info] Loading project definition from somepath/project
[info] Loading settings from build.sbt ...
[info] Set current project to xxx (in build file:somepath/)
[info] Compiling 1 Scala source to somepath/target/scala-2.11/classes ...
[error] somepath/src/main/scala/models.scala:11:12: type arguments [Vector[T],A,M] do not conform to class ProbabilisticClassifier's type parameter bounds [FeaturesType,E <: org.apache.spark.ml.classification.ProbabilisticClassifier[FeaturesType,E,M],M <: org.apache.spark.ml.classification.ProbabilisticClassificationModel[FeaturesType,M]]
[error]     model: ProbabilisticClassifier[Vector[T], A, M],
[error]            ^
[error] one error found
[error] (Compile / compileIncremental) Compilation failed
[error] Total time: 3 s, completed Mar 31, 2018 4:22:31 PM
makefile:127: recipe for target 'target/scala-2.11/classes/models/XModels.class' failed
make: *** [target/scala-2.11/classes/models/XModels.class] Error 1

我已经尝试了几种[A, M, T]参数组合以及方法参数中的不同类型。

这个想法是能够将 aLogisticRegression或 aRandomForestClassifier输入到这个函数中。从文档中:

class LogisticRegression extends ProbabilisticClassifier[Vector, LogisticRegression, LogisticRegressionModel] with LogisticRegressionParams with DefaultParamsWritable with Logging
class RandomForestClassifier extends ProbabilisticClassifier[Vector, RandomForestClassifier, RandomForestClassificationModel] with RandomForestClassifierParams with DefaultParamsWritable

有人可以指出我在哪里可以学习实现这种方法所需的资源吗?

我正在使用Spark2.1.0


编辑 01

谢谢@Andrey Tyukin,

很抱歉,代码不可重现。它实际上是一个字符串。您的代码确实有效,但也许我表达自己错误:

<console>:35: error: type mismatch;
found   : org.apache.spark.ml.classification.LogisticRegression
required: org.apache.spark.ml.classification.ProbabilisticClassifier[Vector[?],?,?]
    val cvModel = models.TalkingDataModels.loadOrCreateModel(logistic_regressor, paramGrid)

所以也许我的想法从一开始就错了。是否可以创建一个同时接受两者LogisticRegression或一个RandomForestClassifier对象的方法?

  • 将代码编辑为MCVE

    import org.apache.spark.ml.classification.{ProbabilisticClassifier, ProbabilisticClassificationModel}
    import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
    import org.apache.spark.ml.param.ParamMap
    import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
    import org.apache.spark.ml.classification.LogisticRegression
    
    object MyModels {
    
    def main(array: Array[String]): Unit = {
        val logisticRegressor = (
            new LogisticRegression()
                .setFeaturesCol("yCol")
                .setLabelCol("labels")
                .setMaxIter(10)
            )
        val paramGrid = (
            new ParamGridBuilder()
            .addGrid(logisticRegressor.regParam, Array(0.01, 0.1, 1))
            .build()
        )
        loadOrCreateModel(logisticRegressor, paramGrid)
        println()
    }
    
    def loadOrCreateModel[
        F,
        M <: ProbabilisticClassificationModel[Vector[F], M],
        P <: ProbabilisticClassifier[Vector[F], P, M]
        ](
        probClassif: ProbabilisticClassifier[Vector[F], P, M],
        paramGrid: Array[ParamMap]
        ): CrossValidator = {
    
        // Binary evaluator.
        val binEvaluator =
            new BinaryClassificationEvaluator()
            .setLabelCol("y")
    
        // Cross validator.
        val cvModel =
            new CrossValidator()
            .setEstimator(probClassif)
            .setEvaluator(binEvaluator)
            .setEstimatorParamMaps(paramGrid)
            .setNumFolds(3)
    
        cvModel
        }
    }
    
4

1 回答 1

1

这在这里编译,但我不得不扔掉你的constants.Const.yColumn-string,并用魔法值替换它"y"

import org.apache.spark.ml.classification.{ProbabilisticClassifier, ProbabilisticClassificationModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

object CrossValidationExample {

  def loadOrCreateModel[
    F, 
    M <: ProbabilisticClassificationModel[Vector[F], M],
    P <: ProbabilisticClassifier[Vector[F], P, M]
  ](
    probClassif: ProbabilisticClassifier[Vector[F], P, M],
    paramGrid: Array[ParamMap]
  ): CrossValidator = {

    // Binary evaluator.
    val binEvaluator = 
      new BinaryClassificationEvaluator()
      .setLabelCol("y")

    // Cross validator.
    val cvModel = 
      new CrossValidator()
      .setEstimator(probClassif)
      .setEvaluator(binEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(3)

    cvModel
  }
}

在定义通用参数列表之前,在您的脑海中执行拓扑排序可能会有所帮助,以了解哪些参数取决于哪些其他参数。

在这里,模型取决于特征的类型,而概率分类器既取决于特征的类型,也取决于模型的类型。因此,按featuresmodelclassifier的顺序声明参数可能更有意义。然后你必须得到正确的 F 有界多态性。


啊,顺便说一下,埃及方括号样式的缩进是恕我直言,这是缩进多个参数列表的唯一明智的方法,类型参数的长度大约为 50 英里(不幸的是,您无法更改类型参数的长度,它们倾向于在我见过的每个机器学习库中都相当冗长)。


编辑(第二个 MCVE 部分的答案)

这是一个非常直接的概括。如果它想要linalg.Vector代替Vector[Feature],那么也可以抽象一下:

import org.apache.spark.ml.classification.{ProbabilisticClassifier, ProbabilisticClassificationModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.linalg.{Vector => LinalgVector}

object CrossValidationExample {

  def main(array: Array[String]): Unit = {
      val logisticRegressor = (
          new LogisticRegression()
              .setFeaturesCol("yCol")
              .setLabelCol("labels")
              .setMaxIter(10)
          )
      val paramGrid = (
          new ParamGridBuilder()
          .addGrid(logisticRegressor.regParam, Array(0.01, 0.1, 1))
          .build()
      )

      loadOrCreateModel(logisticRegressor, paramGrid)

      val rfc: RandomForestClassifier = ???
      loadOrCreateModel(rfc, paramGrid)
  }

  def loadOrCreateModel[
    FeatVec,
    M <: ProbabilisticClassificationModel[FeatVec, M],
    P <: ProbabilisticClassifier[FeatVec, P, M]
  ](
    probClassif: ProbabilisticClassifier[FeatVec, P, M],
    paramGrid: Array[ParamMap]
  ): CrossValidator = {
    // Binary evaluator.
    val binEvaluator =
        new BinaryClassificationEvaluator()
        .setLabelCol("y")

    // Cross validator.
    val cvModel =
        new CrossValidator()
        .setEstimator(probClassif)
        .setEvaluator(binEvaluator)
        .setEstimatorParamMaps(paramGrid)
        .setNumFolds(3)

    cvModel
  }
}
于 2018-03-31T19:52:39.983 回答