案例需求-求用户在某基站停留的时间

2018-01-05  本文已影响0人  lehuai
ObjectCount1.scala
package day08

import java.net.URL

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Computes, for every subject (the host part of the requested URL), the three
  * most frequently requested URLs found in a tab-separated access log.
  *
  * The URL is read from the second column of each line. Lines that have fewer
  * than two columns, and URLs that `java.net.URL` cannot parse, are skipped so
  * that a single bad record does not abort the whole job.
  *
  * Usage: `ObjectCount1 [inputPath]` -- when no argument is given the original
  * hard-coded sample file is read.
  */
object ObjectCount1 {

  import scala.util.Try

  /** Input file used when no path is supplied on the command line. */
  private val DefaultInputPath = "D:/teachingprogram/Spark学习视频/day08/access.txt"

  /** Returns the URL column (index 1) of a tab-separated log line, if the line has one. */
  private def extractUrl(line: String): Option[String] = {
    val fields = line.split("\t")
    if (fields.length > 1) Some(fields(1)) else None
  }

  /** Returns the host of `url`, or `None` when `java.net.URL` rejects the string. */
  private def hostOf(url: String): Option[String] =
    Try(new URL(url).getHost).toOption

  /**
    * Runs the job on a local two-thread Spark master and prints the result.
    *
    * @param args optional; `args(0)` overrides the default input path
    */
  def main(args: Array[String]): Unit = {

    val conf: SparkConf = new SparkConf().setAppName("ObjectCount1").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(conf)

    // The context is stopped in `finally` so it is released even if the job fails.
    try {
      // Load the raw access log.
      val inputPath: String = args.headOption.getOrElse(DefaultInputPath)
      val file: RDD[String] = sc.textFile(inputPath)

      // Pair every well-formed line's URL with a count of one.
      val urlAndOne: RDD[(String, Int)] =
        file.flatMap(line => extractUrl(line).map(url => (url, 1)).iterator)

      // Sum the counts of identical URLs.
      val sumedUrl: RDD[(String, Int)] = urlAndOne.reduceByKey(_ + _)

      // Attach the subject (URL host) to each (url, count) pair.
      val project: RDD[(String, String, Int)] = sumedUrl.flatMap { case (url, count) =>
        hostOf(url).map(host => (host, url, count)).iterator
      }

      // Group by subject and keep the three most requested URLs of each group.
      val res: RDD[(String, List[(String, String, Int)])] =
        project.groupBy(_._1).mapValues(_.toList.sortBy(_._3).reverse.take(3))

      println(res.collect().toBuffer)
    } finally {
      sc.stop()
    }
  }
}



上一篇下一篇

猜你喜欢

热点阅读