案例需求-求用户在某基站停留的时间
2018-01-05 本文已影响0人
lehuai
ObjectCount1.scala
package day08
import java.net.URL
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Reports, for every subject, the three most frequently requested URLs found
 * in a tab-separated access log.
 *
 * The "subject" of a request is the host name of its URL. The URL is read from
 * the second tab-separated field of each log line. Lines with fewer than two
 * fields and URLs that `java.net.URL` cannot parse are skipped, so a single
 * malformed record does not abort the whole job.
 */
object ObjectCount1 {

  /**
   * Runs the job on a local two-thread Spark master and prints the result.
   *
   * @param args optional; `args(0)` is the path of the access log. When it is
   *             absent the original hard-coded path is used.
   */
  def main(args: Array[String]): Unit = {
    import scala.util.Try

    val defaultInput = "D:/teachingprogram/Spark学习视频/day08/access.txt"
    val inputPath = args.headOption.getOrElse(defaultInput)
    // Number of URLs kept per subject.
    val topN = 3

    val conf: SparkConf = new SparkConf().setAppName("ObjectCount1").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(conf)
    try {
      // Load the raw log lines.
      val file: RDD[String] = sc.textFile(inputPath)

      // Extract the URL (second field) and pair it with 1; lines that do not
      // have a second field are dropped instead of throwing on fields(1).
      val urlAndOne: RDD[(String, Int)] = file
        .map(_.split("\t"))
        .filter(_.length > 1)
        .map(fields => (fields(1), 1))

      // Sum the hits of identical URLs.
      val sumedUrl: RDD[(String, Int)] = urlAndOne.reduceByKey(_ + _)

      // Attach the subject (host name) to every (url, count) pair. URLs that
      // java.net.URL rejects are dropped rather than failing the stage.
      val project: RDD[(String, String, Int)] = sumedUrl.flatMap { case (url, count) =>
        Try(new URL(url).getHost).toOption.map(host => (host, url, count)).toList
      }

      // Group by subject and keep the topN URLs with the highest hit count.
      val res: RDD[(String, List[(String, String, Int)])] =
        project.groupBy(_._1).mapValues(_.toList.sortBy(_._3).reverse.take(topN))

      println(res.collect().toBuffer)
    } finally {
      // Always release the SparkContext, even when a stage above fails.
      sc.stop()
    }
  }
}