2017年3月27日 - 慢慢的回味

原创文章，转载请注明： 转载自慢慢的回味

本文链接地址: Logistic Regression(逻辑回归) in ML

Content:

概览

Code in org.apache.spark.ml.classification.LogisticRegressionSuite.scala

class LogisticRegressionSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
 
  import testImplicits._ //Mark A
 
  private val seed = 42
  @transient var smallBinaryDataset: Dataset[_] = _  //Mark B
 
  override def beforeAll(): Unit = {
    super.beforeAll()
 
    smallBinaryDataset = generateLogisticInput(1.0, 1.0, nPoints = 100, seed = seed).toDF()  //Mark C
 
......
 
  test("logistic regression: default params") {
    val lr = new LogisticRegression
    assert(lr.getLabelCol === "label")
    assert(lr.getFeaturesCol === "features")
    assert(lr.getPredictionCol === "prediction")
    assert(lr.getRawPredictionCol === "rawPrediction")
    assert(lr.getProbabilityCol === "probability")
    assert(lr.getFamily === "auto")
    assert(!lr.isDefined(lr.weightCol))
    assert(lr.getFitIntercept)
    assert(lr.getStandardization)
    val model = lr.fit(smallBinaryDataset)
    model.transform(smallBinaryDataset)
      .select("label", "probability", "prediction", "rawPrediction")
      .collect()
    assert(model.getThreshold === 0.5)
    assert(model.getFeaturesCol === "features")
    assert(model.getPredictionCol === "prediction")
    assert(model.getRawPredictionCol === "rawPrediction")
    assert(model.getProbabilityCol === "probability")
    assert(model.intercept !== 0.0)
    assert(model.hasParent)
 
    // copied model must have the same parent.
    MLTestingUtils.checkCopy(model)
    assert(model.hasSummary)
    val copiedModel = model.copy(ParamMap.empty)
    assert(copiedModel.hasSummary)
    model.setSummary(None)
    assert(!model.hasSummary)
  }
 
......
 
object LogisticRegressionSuite {
 
......
 
  // Generate input of the form Y = logistic(offset + scale*X)
  def generateLogisticInput(
      offset: Double,
      scale: Double,
      nPoints: Int,
      seed: Int): Seq[LabeledPoint] = {
    val rnd = new Random(seed)
    val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian())
 
    val y = (0 until nPoints).map { i =&gt;
      val p = 1.0 / (1.0 + math.exp(-(offset + scale * x1(i))))
      if (rnd.nextDouble() &lt; p) 1.0 else 0.0 } val testData = (0 until nPoints).map(i =&gt; LabeledPoint(y(i), Vectors.dense(Array(x1(i)))))
    testData
  }
......

class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ //Mark A private val seed = 42 @transient var smallBinaryDataset: Dataset[_] = _ //Mark B override def beforeAll(): Unit = { super.beforeAll() smallBinaryDataset = generateLogisticInput(1.0, 1.0, nPoints = 100, seed = seed).toDF() //Mark C ...... test("logistic regression: default params") { val lr = new LogisticRegression assert(lr.getLabelCol === "label") assert(lr.getFeaturesCol === "features") assert(lr.getPredictionCol === "prediction") assert(lr.getRawPredictionCol === "rawPrediction") assert(lr.getProbabilityCol === "probability") assert(lr.getFamily === "auto") assert(!lr.isDefined(lr.weightCol)) assert(lr.getFitIntercept) assert(lr.getStandardization) val model = lr.fit(smallBinaryDataset) model.transform(smallBinaryDataset) .select("label", "probability", "prediction", "rawPrediction") .collect() assert(model.getThreshold === 0.5) assert(model.getFeaturesCol === "features") assert(model.getPredictionCol === "prediction") assert(model.getRawPredictionCol === "rawPrediction") assert(model.getProbabilityCol === "probability") assert(model.intercept !== 0.0) assert(model.hasParent) // copied model must have the same parent. MLTestingUtils.checkCopy(model) assert(model.hasSummary) val copiedModel = model.copy(ParamMap.empty) assert(copiedModel.hasSummary) model.setSummary(None) assert(!model.hasSummary) } ...... object LogisticRegressionSuite { ...... // Generate input of the form Y = logistic(offset + scale*X) def generateLogisticInput( offset: Double, scale: Double, nPoints: Int, seed: Int): Seq[LabeledPoint] = { val rnd = new Random(seed) val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian()) val y = (0 until nPoints).map { i => val p = 1.0 / (1.0 + math.exp(-(offset + scale * x1(i)))) if (rnd.nextDouble() < p) 1.0 else 0.0 } val testData = (0 until nPoints).map(i => LabeledPoint(y(i), Vectors.dense(Array(x1(i))))) testData } ......

注意到Mark A, Mark B and Mark C，方法”generateLogisticInput”返回Seq结果，但是”smallBinaryDataset”是Dataset类型。注意到在Mark A处，”localSeqToDatasetHolder”方法将把Seq隐式转换为DatasetHolder，而DatasetHolder有一个toDF()方法可以把Dateset转换为DataFrame。

abstract class SQLImplicits {
......
  /**
   * Creates a [[Dataset]] from a local Seq.
   * @since 1.6.0
   */
  implicit def localSeqToDatasetHolder[T : Encoder](s: Seq[T]): DatasetHolder[T] = {
    DatasetHolder(_sqlContext.createDataset(s))
  }
......
 
case class DatasetHolder[T] private[sql](private val ds: Dataset[T]) {
 
  // This is declared with parentheses to prevent the Scala compiler from treating
  // `rdd.toDS("1")` as invoking this toDS and then apply on the returned Dataset.
  def toDS(): Dataset[T] = ds
 
  // This is declared with parentheses to prevent the Scala compiler from treating
  // `rdd.toDF("1")` as invoking this toDF and then apply on the returned DataFrame.
  def toDF(): DataFrame = ds.toDF()
 
  def toDF(colNames: String*): DataFrame = ds.toDF(colNames : _*)
}

构造Model

用LogisticRegression来构建model，方法”val model = lr.fit(smallBinaryDataset)”。
继续阅读“Logistic Regression(逻辑回归) in ML”本作品采用知识共享署名 4.0 国际许可协议进行许可。

一	二	三	四	五	六	日
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31