排序及topN
2017-03-28 本文已影响653人
ibunny
基于排序机制的wordcount
按照每个单词出现次数的顺序,降序排序
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
 * Word count whose output is ordered by how often each word occurs,
 * most frequent word first.
 */
public class SortWordCount {

    /** Breaks a line of text into the tokens separated by single spaces. */
    private static final class LineTokenizer implements FlatMapFunction<String, String> {
        private static final long serialVersionUID = 1L;

        @Override
        public Iterable<String> call(String line) throws Exception {
            String[] tokens = line.split(" ");
            return Arrays.asList(tokens);
        }
    }

    /** Pairs every word with an initial count of one. */
    private static final class CountOne implements PairFunction<String, String, Integer> {
        private static final long serialVersionUID = 1L;

        @Override
        public Tuple2<String, Integer> call(String word) throws Exception {
            return new Tuple2<String, Integer>(word, 1);
        }
    }

    /** Adds two partial counts that belong to the same word. */
    private static final class AddCounts implements Function2<Integer, Integer, Integer> {
        private static final long serialVersionUID = 1L;

        @Override
        public Integer call(Integer left, Integer right) throws Exception {
            return left + right;
        }
    }

    /** Swaps the two components of a pair: (a, b) becomes (b, a). */
    private static final class Flip<A, B> implements PairFunction<Tuple2<A, B>, B, A> {
        private static final long serialVersionUID = 1L;

        @Override
        public Tuple2<B, A> call(Tuple2<A, B> pair) throws Exception {
            return new Tuple2<B, A>(pair._2, pair._1);
        }
    }

    /** Prints one (word, count) result line to standard output. */
    private static final class PrintCount implements VoidFunction<Tuple2<String, Integer>> {
        private static final long serialVersionUID = 1L;

        @Override
        public void call(Tuple2<String, Integer> entry) throws Exception {
            String word = entry._1;
            Integer count = entry._2;
            System.out.println(word + " appears " + count + " times.");
        }
    }

    public static void main(String[] args) {
        // Set up the Spark configuration and context.
        SparkConf conf = new SparkConf()
                .setAppName("SortWordCount")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Plain word count: elements look like (hello, 3), (you, 2).
        JavaPairRDD<String, Integer> wordCounts = sc
                .textFile("C://Users//Administrator//Desktop//spark.txt")
                .flatMap(new LineTokenizer())
                .mapToPair(new CountOne())
                .reduceByKey(new AddCounts());

        // sortByKey orders by key, so flip to (3, hello), sort in descending
        // order, then flip back to (hello, 3).
        JavaPairRDD<String, Integer> sortedWordCounts = wordCounts
                .mapToPair(new Flip<String, Integer>())
                .sortByKey(false)
                .mapToPair(new Flip<Integer, String>());

        // Print the word counts in sorted order.
        sortedWordCounts.foreach(new PrintCount());

        // Shut down the JavaSparkContext.
        sc.close();
    }
}
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Word count whose output is ordered by how often each word occurs,
 * most frequent word first.
 */
object SortWordCount {

  /**
   * Runs the sorted word count against the hard-coded input file and prints
   * one line per distinct word.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("SortWordCount")
      .setMaster("local")
    val sc = new SparkContext(conf)

    try {
      val lines = sc.textFile("C://Users//Administrator//Desktop//spark.txt", 1)

      // Plain word count: elements look like (hello, 3), (you, 2).
      val wordCounts = lines
        .flatMap { line => line.split(" ") }
        .map { word => (word, 1) }
        .reduceByKey(_ + _)

      // sortByKey orders by key, so swap to (count, word), sort in
      // descending order, then swap back to (word, count).
      val sortedWordCounts = wordCounts
        .map { case (word, count) => (count, word) }
        .sortByKey(false)
        .map { case (count, word) => (word, count) }

      sortedWordCounts.foreach { case (word, count) =>
        println(word + " appear " + count + " times.")
      }
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
二次排序
按照文件的第一列排序,如果第一列相同,则按照第二列排序。
/*******SecondarySortKey.java*******/
import java.io.Serializable;
import scala.math.Ordered;
/**
 * Custom two-column sort key.
 *
 * <p>Keys are ordered by {@code first}; ties are broken by {@code second}.
 * All relational operators required by {@code scala.math.Ordered} are derived
 * from {@link #compare(SecondarySortKey)}, so they always agree with it.
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable {
    private static final long serialVersionUID = -2366006422945129991L;

    // The columns that take part in the ordering.
    private int first;
    private int second;

    /**
     * @param first  primary sort column
     * @param second secondary sort column, used only when {@code first} ties
     */
    public SecondarySortKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    /** @return {@code true} when this key sorts strictly after {@code other} */
    @Override
    public boolean $greater(SecondarySortKey other) {
        return compare(other) > 0;
    }

    /** @return {@code true} when this key sorts after or equal to {@code other} */
    @Override
    public boolean $greater$eq(SecondarySortKey other) {
        return compare(other) >= 0;
    }

    /** @return {@code true} when this key sorts strictly before {@code other} */
    @Override
    public boolean $less(SecondarySortKey other) {
        return compare(other) < 0;
    }

    /** @return {@code true} when this key sorts before or equal to {@code other} */
    @Override
    public boolean $less$eq(SecondarySortKey other) {
        return compare(other) <= 0;
    }

    /**
     * Compares by {@code first}, then by {@code second}.
     *
     * <p>Uses {@link Integer#compare(int, int)} rather than subtraction:
     * a difference such as {@code Integer.MIN_VALUE - 1} overflows and would
     * report the wrong sign.
     *
     * @return a negative value, zero, or a positive value when this key is
     *         less than, equal to, or greater than {@code other}
     */
    @Override
    public int compare(SecondarySortKey other) {
        int byFirst = Integer.compare(this.first, other.getFirst());
        if (byFirst != 0) {
            return byFirst;
        }
        return Integer.compare(this.second, other.getSecond());
    }

    /** Same ordering as {@link #compare(SecondarySortKey)}. */
    @Override
    public int compareTo(SecondarySortKey other) {
        return compare(other);
    }

    // Getters and setters for the sort columns, plus hashCode and equals.

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + first;
        result = prime * result + second;
        return result;
    }

    /** Two keys are equal when they have the same class and both columns match. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        SecondarySortKey other = (SecondarySortKey) obj;
        return first == other.first && second == other.second;
    }
}
/**********SecondarySort.java***********/
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
 * Secondary sort.
 * 1. Implement a custom key that implements Ordered and Serializable and
 *    carries the multi-column ordering.
 * 2. Map the RDD of text lines to a JavaPairRDD whose key is the custom key
 *    and whose value is the line itself.
 * 3. Sort by the custom key with the sortByKey operator.
 * 4. Map again to drop the custom key and keep only the text lines.
 */
public class SecondarySort {

    /** Builds the (sort key, line) pair from the first two space-separated columns. */
    private static final class KeyByColumns
            implements PairFunction<String, SecondarySortKey, String> {
        private static final long serialVersionUID = 1L;

        @Override
        public Tuple2<SecondarySortKey, String> call(String line) throws Exception {
            String[] columns = line.split(" ");
            int first = Integer.parseInt(columns[0]);
            int second = Integer.parseInt(columns[1]);
            SecondarySortKey key = new SecondarySortKey(first, second);
            return new Tuple2<SecondarySortKey, String>(key, line);
        }
    }

    /** Discards the sort key and keeps only the original line. */
    private static final class DropKey
            implements Function<Tuple2<SecondarySortKey, String>, String> {
        private static final long serialVersionUID = 1L;

        @Override
        public String call(Tuple2<SecondarySortKey, String> keyedLine) throws Exception {
            return keyedLine._2;
        }
    }

    /** Writes one line to standard output. */
    private static final class PrintLine implements VoidFunction<String> {
        private static final long serialVersionUID = 1L;

        @Override
        public void call(String line) throws Exception {
            // Sample output noted by the original author:
            // 1 3
            // 1 5
            // 2 1
            // 2 4
            System.out.println(line);
        }
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("SecondarySort")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> sortedLines = sc
                .textFile("C://Users//Administrator//Desktop//sort.txt")
                .mapToPair(new KeyByColumns())
                .sortByKey()
                .map(new DropKey());
        sortedLines.foreach(new PrintLine());

        sc.close();
    }
}
/***********SecondSortKey.scala***********/
/**
 * Two-column sort key: ordered by `first`, ties broken by `second`.
 *
 * @param first  primary sort column
 * @param second secondary sort column, used only when `first` ties
 */
class SecondSortKey(val first: Int, val second: Int)
    extends Ordered[SecondSortKey] with Serializable {

  /**
   * Compares by `first`, then by `second`.
   *
   * Uses `Ordering.Int.compare` rather than subtraction: a difference such
   * as `Int.MinValue - 1` overflows and would report the wrong sign.
   *
   * @return a negative value, zero, or a positive value when this key is
   *         less than, equal to, or greater than `that`
   */
  def compare(that: SecondSortKey): Int = {
    val byFirst = Ordering.Int.compare(this.first, that.first)
    if (byFirst != 0) byFirst
    else Ordering.Int.compare(this.second, that.second)
  }
}
/***********SecondSort.scala***********/
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Secondary sort: orders the lines of the input file by their first column
 * and, when that ties, by their second column.
 */
object SecondSort {

  /**
   * Reads the hard-coded input file, sorts its lines with SecondSortKey and
   * prints them.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("SecondSort")
      .setMaster("local")
    val sc = new SparkContext(conf)

    try {
      val lines = sc.textFile("C://Users//Administrator//Desktop//sort.txt", 1)

      // Key every line by its first two space-separated columns; split once
      // per line instead of once per column.
      val pairs = lines.map { line =>
        val columns = line.split(" ")
        (new SecondSortKey(columns(0).toInt, columns(1).toInt), line)
      }

      // Sort by the composite key, then drop the key again.
      val sortedLines = pairs.sortByKey().map(sortedPair => sortedPair._2)
      sortedLines.foreach { sortedLine => println(sortedLine) }
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
topN
对文件内的数字,取最大的前3个
3
5
6
7
1
4
5
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
 * Prints the three largest numbers found in the input file.
 */
public class Top3 {

    /** Keys a line by its integer value, keeping the raw text as the value. */
    private static final class KeyByNumber implements PairFunction<String, Integer, String> {
        private static final long serialVersionUID = 1L;

        @Override
        public Tuple2<Integer, String> call(String text) throws Exception {
            Integer number = Integer.valueOf(text);
            return new Tuple2<Integer, String>(number, text);
        }
    }

    /** Keeps only the numeric key of a (number, text) pair. */
    private static final class KeyOnly implements Function<Tuple2<Integer, String>, Integer> {
        private static final long serialVersionUID = 1L;

        @Override
        public Integer call(Tuple2<Integer, String> pair) throws Exception {
            return pair._1;
        }
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("Top3")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Sort all numbers in descending order, then take the leading three.
        JavaRDD<Integer> sortedNumbers = sc
                .textFile("C://Users//Administrator//Desktop//top.txt")
                .mapToPair(new KeyByNumber())
                .sortByKey(false)
                .map(new KeyOnly());

        List<Integer> largest = sortedNumbers.take(3);
        for (int i = 0; i < largest.size(); i++) {
            System.out.println(largest.get(i));
        }

        sc.close();
    }
}
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Prints the three largest numbers found in the input file.
 */
object Top3 {

  /**
   * Sorts the numbers of the hard-coded input file in descending order and
   * prints the first three.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("Top3")
      .setMaster("local")
    val sc = new SparkContext(conf)

    try {
      val lines = sc.textFile("C://Users//Administrator//Desktop//top.txt", 1)

      // Key each line by its integer value so sortByKey can order it.
      val sortedNumbers = lines
        .map { line => (line.toInt, line) }
        .sortByKey(false)
        .map(sortedPair => sortedPair._1)

      val top3Number = sortedNumbers.take(3)
      top3Number.foreach(num => println(num))
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
班级内的学生成绩,取出前3名
class1 90
class2 56
class1 87
class1 76
class2 88
class1 95
class1 74
class2 87
class2 67
class2 77
import java.util.Arrays;
import java.util.Iterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
 * Grouped top-N: prints the three highest scores of every class.
 *
 * <p>Each input line is expected to hold a class name and an integer score
 * separated by a single space.
 */
public class GroupTop3 {

    /** Number of highest scores kept per class. */
    private static final int TOP_N = 3;

    public static void main(String[] args) {
        // NOTE(review): the app name is "Top3" rather than "GroupTop3";
        // left unchanged here, confirm whether that is intentional.
        SparkConf conf = new SparkConf()
                .setAppName("Top3")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        try {
            JavaRDD<String> lines = sc.textFile("C://Users//Administrator//Desktop//score.txt");

            // Parse "className score" into (className, score).
            JavaPairRDD<String, Integer> pairs = lines.mapToPair(
                    new PairFunction<String, String, Integer>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<String, Integer> call(String line) throws Exception {
                            String[] lineSplited = line.split(" ");
                            return new Tuple2<String, Integer>(lineSplited[0],
                                    Integer.valueOf(lineSplited[1]));
                        }
                    });

            JavaPairRDD<String, Iterable<Integer>> groupedPairs = pairs.groupByKey();

            // Reduce every class to its highest scores, in descending order.
            JavaPairRDD<String, Iterable<Integer>> top3Score = groupedPairs.mapToPair(
                    new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<String, Iterable<Integer>> call(
                                Tuple2<String, Iterable<Integer>> classScores)
                                throws Exception {
                            String className = classScores._1;
                            Integer[] best = highestScores(classScores._2, TOP_N);
                            return new Tuple2<String, Iterable<Integer>>(
                                    className, Arrays.asList(best));
                        }
                    });

            top3Score.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public void call(Tuple2<String, Iterable<Integer>> t) throws Exception {
                    System.out.println("class: " + t._1);
                    for (Integer score : t._2) {
                        System.out.println(score);
                    }
                    System.out.println("=======================================");
                }
            });
        } finally {
            // Release the context even when the job fails.
            sc.close();
        }
    }

    /**
     * Selects the largest values from {@code scores}.
     *
     * <p>Keeps a small array sorted in descending order and inserts every
     * score at its position, pushing smaller entries down and dropping
     * whatever falls off the end. A score equal to an existing entry is
     * placed after it.
     *
     * @param scores the values to scan
     * @param limit  maximum number of values to keep
     * @return the kept values in descending order; the array length is
     *         {@code min(limit, number of scores)}, so it never contains
     *         {@code null} padding when fewer than {@code limit} scores exist
     */
    private static Integer[] highestScores(Iterable<Integer> scores, int limit) {
        Integer[] best = new Integer[limit];
        int filled = 0;
        for (Integer score : scores) {
            // Find the first slot whose current entry is smaller than score.
            int pos = 0;
            while (pos < filled && score <= best[pos]) {
                pos++;
            }
            if (pos == limit) {
                continue; // not larger than any kept score
            }
            // Shift the tail down by one to make room at pos.
            int last = Math.min(filled, limit - 1);
            for (int j = last; j > pos; j--) {
                best[j] = best[j - 1];
            }
            best[pos] = score;
            if (filled < limit) {
                filled++;
            }
        }
        return Arrays.copyOf(best, filled);
    }
}