Analyzing Clickstream Log Data with Spark
2018-07-22 嗷老板
Sample of the log data
194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
183.49.46.228 - - [18/Sep/2013:06:49:23 +0000] "-" 400 0 "-" "-"
163.177.71.12 - - [18/Sep/2013:06:49:33 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
163.177.71.12 - - [18/Sep/2013:06:49:36 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
101.226.68.137 - - [18/Sep/2013:06:49:42 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
101.226.68.137 - - [18/Sep/2013:06:49:45 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
60.208.6.156 - - [18/Sep/2013:06:49:48 +0000] "GET /wp-content/uploads/2013/07/rcassandra.png HTTP/1.0" 200 185524 "http://cos.name/category/software/packages/" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "GET /images/my.jpg HTTP/1.1" 200 19939 "http://www.angularjs.cn/A00n" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
222.68.172.190 - - [18/Sep/2013:06:50:08 +0000] "-" 400 0 "-" "-"
183.195.232.138 - - [18/Sep/2013:06:50:16 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
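Each record is in the Apache combined log format. Splitting a line on spaces, field 0 is the client IP, fields 3 and 4 hold the timestamp, fields 5 through 7 the request line, field 8 the status code, field 9 the response size, and field 10 the quoted referer URL. A minimal sketch of pulling out the fields the jobs below rely on (indices read off the sample above):

// Sketch: splitting one log line into the fields used by the jobs below.
val line = "163.177.71.12 - - [18/Sep/2013:06:49:33 +0000] \"HEAD / HTTP/1.1\" 200 20 \"-\" \"DNSPod-Monitor/1.0\""
val fields  = line.split(" ")
val ip      = fields(0)   // client IP, used for UV
val status  = fields(8)   // HTTP status code
val referer = fields(10)  // quoted referer URL, used for TopN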
1. Computing PV (total page views)
import org.apache.spark.{SparkConf, SparkContext}

// TODO: analyze clickstream log data with Spark -- total PV
object PV {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkConf; run in local mode with two worker threads
    val sparkConf = new SparkConf().setAppName("PV").setMaster("local[2]")
    // 2. Create the SparkContext
    val sc = new SparkContext(sparkConf)
    // 3. Read the log file
    val data = sc.textFile("f:\\text\\access.log")
    // 4. The PV total is simply the number of log lines
    println(data.count())
    // 5. Stop the SparkContext
    sc.stop()
  }
}
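Note that data.count() treats every line as a page view, including the DNSPod-Monitor health-check probes visible in the sample. If those should be excluded, a minimal refinement (my assumption, not part of the original job) is to filter before counting:

// Sketch: skip monitor probes before counting (assumed refinement,
// not in the original post). Reuses the `data` RDD from the PV job above.
val realPV = data.filter(line => !line.contains("DNSPod-Monitor")).count()
println(realPV)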
2. Computing UV (unique visitors)
import org.apache.spark.{SparkConf, SparkContext}

// TODO: analyze clickstream log data with Spark -- total UV
object UV {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkConf; run in local mode with two worker threads
    val sparkConf = new SparkConf().setAppName("UV").setMaster("local[2]")
    // 2. Create the SparkContext
    val sc = new SparkContext(sparkConf)
    // 3. Read the log file
    val data = sc.textFile("f:\\text\\access.log")
    // 4. Split each line and take field 0, the client IP
    val ips = data.map(_.split(" ")(0))
    // 5. Deduplicate the IP addresses
    val distinctIPs = ips.distinct()
    // 6. The UV total is the number of distinct IPs
    println(distinctIPs.count())
    // 7. Stop the SparkContext
    sc.stop()
  }
}
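This yields one global UV figure. If UV per day is wanted instead, the date can be keyed together with the IP before deduplicating. A sketch under the field layout shown in the sample (dayAndIp and uvPerDay are illustrative names of mine):

// Sketch: UV per day, assuming field 3 looks like "[18/Sep/2013:06:49:18".
// Reuses the `data` RDD created in the UV job above.
val dayAndIp = data.map(_.split(" "))
  .filter(_.length > 3)
  .map(f => (f(3).drop(1).split(":")(0), f(0)))  // (day, ip)
  .distinct()
val uvPerDay = dayAndIp.map { case (day, _) => (day, 1) }.reduceByKey(_ + _)
uvPerDay.foreach(println)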
3. Computing the Top N URLs
import org.apache.spark.{SparkConf, SparkContext}

// TODO: analyze clickstream log data with Spark -- top-N URLs
object TopN {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkConf; run in local mode with two worker threads
    val sparkConf = new SparkConf().setAppName("TopN").setMaster("local[2]")
    // 2. Create the SparkContext
    val sc = new SparkContext(sparkConf)
    // 3. Read the log file
    val data = sc.textFile("f:\\text\\access.log")
    // 4. Split each line, drop malformed records, pair each URL
    //    (field 10, the quoted referer) with a count of 1, and
    //    filter out the empty referer "-"
    val urlAndOne = data.map(_.split(" "))
      .filter(_.length > 10)
      .map(x => (x(10), 1))
      .filter(_._1 != "\"-\"")
    // 5. Sum the occurrences of each URL
    val urlTotalNum = urlAndOne.reduceByKey(_ + _)
    // 6. Sort by count in descending order
    val sortUrl = urlTotalNum.sortBy(_._2, false)
    // 7. Take the five most frequent URLs
    val result = sortUrl.take(5)
    // 8. Print the results
    result.foreach(println)
    // 9. Stop the SparkContext
    sc.stop()
  }
}
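sortBy followed by take(5) works, but it sorts the whole RDD just to keep five records. Spark's RDD.top with an Ordering on the count returns the same result while only tracking the current top five per partition; a sketch (top5 is an illustrative name):

// Sketch: same top-5 without a full sort, via RDD.top with a custom Ordering.
// Reuses the `urlTotalNum` RDD from the TopN job above.
val top5 = urlTotalNum.top(5)(Ordering.by((t: (String, Int)) => t._2))
top5.foreach(println)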