Spark Dataset learning on Udemy

2019-06-20  赐我理由在披甲上阵

DataSet and DataFrame

https://www.cnblogs.com/starwater/p/6841807.html

DataFrame

A DataFrame is basically a DataSet of Row objects; that's really all it is.
A Row can in turn contain any number of columns of information,
which may be of whatever type you want.
It is more like a database table:
it has a schema, which lets Spark represent the data more efficiently.
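As a quick sketch of what that schema looks like in practice (a minimal example, assuming a local SparkSession; the Student case class here is just an illustration):

import org.apache.spark.sql.SparkSession

object SchemaDemo {

  case class Student(name: String, score: Double)

  def main(args: Array[String]) {
    val spark = SparkSession.builder
      .appName("SchemaDemo")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // Build a small DataFrame; Spark infers column names and types
    // from the case class fields.
    val df = Seq(Student("a", 100.0), Student("b", 98.0)).toDF()

    df.printSchema()
    // root
    //  |-- name: string (nullable = true)
    //  |-- score: double (nullable = false)

    spark.stop()
  }
}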

DataSet

A DataFrame is also a DataSet:
DataFrame = DataSet[Row]

A DataSet is, more generally, a set of structured data that is not necessarily made of Rows; it can be of a specific type, e.g.
DataSet[Person], DataSet[(String, Double)]
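A small sketch of the typed side (the Person case class and the people.json path are placeholders; assumes a SparkSession named spark is in scope):

import org.apache.spark.sql.Dataset

case class Person(name: String, age: Long)

import spark.implicits._

// An untyped DataFrame (i.e. Dataset[Row])...
val df = spark.read.json("people.json")

// ...becomes a typed Dataset[Person] once an element type is supplied.
val people: Dataset[Person] = df.as[Person]

// A Dataset of tuples works the same way:
val pairs: Dataset[(String, Double)] = Seq(("a", 100.0), ("b", 98.0)).toDS()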

A DataFrame's schema is inferred at run time,
while a DataSet's schema can be inferred at compile time.
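A sketch of the practical difference, reusing the Dataset[Person] from the snippet above: a misspelled DataFrame column name only fails at run time, while a misspelled DataSet field fails at compile time.

// DataFrame: column names are plain strings, so this compiles and
// only throws an AnalysisException when Spark analyzes the query.
// df.select("nmae")

// DataSet: "nmae" is not a field of Person, so this does not compile.
// people.map(_.nmae)

// The typed version is checked by the Scala compiler:
people.map(_.age + 1).show()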
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.max

object SparkTest {

  case class Student(name: String, score: Double)

  // Parse one CSV line of the form "name,score" into a Student.
  def parseLine(line: String): Student = {
    val fields = line.split(",")
    Student(fields(0), fields(1).toDouble)
  }

  // Find and print the highest score in the DataFrame.
  def showTopGradeStudent(df: DataFrame) = {
    val topGrade = df.agg(max("score")).collect()
    topGrade.foreach(println)
  }

  def main(args: Array[String]) {
    val spark = SparkSession.builder
      .appName("TopGrade")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    val lines = spark.sparkContext.textFile("../student.csv")
    val df = lines.map(parseLine).toDF().cache()
    showTopGradeStudent(df)

    spark.stop()
  }

  // Print the schema and register the DataFrame as a SQL temp view.
  def showOut(schemaStudent: DataFrame) = {
    schemaStudent.printSchema()
    schemaStudent.createOrReplaceTempView("students")
  }
}
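Once showOut has registered the view, it can be queried with Spark SQL; a short sketch, assuming the "students" view name used above and the SparkSession from main:

// Same result as showTopGradeStudent, expressed in SQL.
val top = spark.sql("SELECT name, score FROM students ORDER BY score DESC LIMIT 1")
top.show()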


import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.max

object SparkTest {

  case class Student(name: String, score: Double)

  // Parse one CSV line of the form "name,score" into a Student.
  def parseLine(line: String): Student = {
    val fields = line.split(",")
    Student(fields(0), fields(1).toDouble)
  }

  def main(args: Array[String]) {
    val spark = SparkSession.builder
      .appName("TopGrade")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    val lines = spark.sparkContext.textFile("../student.csv")
    val df = lines.map(parseLine).toDF()
    val topGrade = df.agg(max("score")).collect()

    // Alternative: build the input inline instead of reading a file.
    // val data = Array("a,100", "b,100", "c,98", "d,78")
    // val lines = spark.sparkContext.parallelize(data)

    // A DataFrame row is accessed by column name:
    // df.foreach(item => println(item.getAs[String]("name")))

    // A typed DataSet exposes the fields directly:
    // val stdDS = lines.map(parseLine).toDS().cache()
    // stdDS.foreach(item => println(item.name))

    spark.stop()
  }
}
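For comparison, the same max-score aggregation can stay fully typed on a DataSet — a sketch assuming the same Student case class, with the input hardcoded for brevity:

import org.apache.spark.sql.SparkSession

object TypedAggDemo {

  case class Student(name: String, score: Double)

  def main(args: Array[String]) {
    val spark = SparkSession.builder
      .appName("TypedAggDemo")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    val ds = Seq(Student("a", 100.0), Student("b", 100.0), Student("c", 98.0)).toDS()

    // Dataset[Student] -> Dataset[Double] -> plain Double; no Row unwrapping.
    val top = ds.map(_.score).reduce((a, b) => math.max(a, b))
    println(s"top score: $top")

    spark.stop()
  }
}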