Deploying PySpark with Docker and Testing a Simple Local Wordcount Example
2019-01-11 数据小菜鸟
# Search Docker Hub for PySpark images
docker search pyspark
# Pull the corresponding image
docker pull fokkodriesprong/docker-pyspark
# Start an interactive container with the hostname "sandbox"
docker run -it -h sandbox fokkodriesprong/docker-pyspark bash
# Launch spark-shell on YARN in client mode, with 512 MB of driver/executor memory and one executor core
# (on Spark 2.x and later, the yarn-client master string is deprecated; use --master yarn --deploy-mode client)
spark-shell --master yarn-client --driver-memory 512m --executor-memory 512m --executor-cores 1
# Or launch spark-shell in local mode for quick testing
spark-shell
# Verify the shell works: parallelize a small range and count it
scala> sc.parallelize(1 to 1000).count()
res0: Long = 1000
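The same sanity check can be run from the pyspark REPL; a minimal sketch, assuming the image's pyspark shell pre-creates a SparkContext named sc just as the Scala shell does:

from operator import add  # not needed for this check, but used in the wordcount below
# Distribute the numbers 0..999 across partitions and count them back
sc.parallelize(range(1000)).count()
# Expected result: 1000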
# PySpark test: create a small three-line input file
echo "hello world" > /root/test.txt
echo "hello world 1" >> /root/test.txt
echo "hello world 2" >> /root/test.txt
cat /root/test.txt
pyspark
from operator import add

# Load the test file as an RDD of lines
text = sc.textFile("/root/test.txt")
text.count()  # should return 3, one per line

# Classic wordcount: split each line into words, map each word to (word, 1),
# then sum the counts per word and bring the results back to the driver
text.flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(add).collect()
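To reuse the job outside the REPL, the same pipeline can be packaged as a standalone script and run with spark-submit. A minimal sketch; the file name wordcount.py and the local[*] master are assumptions for illustration, not from the original post:

from operator import add
from pyspark import SparkConf, SparkContext

if __name__ == "__main__":
    # Run locally with as many worker threads as there are cores
    conf = SparkConf().setAppName("wordcount").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    # Same pipeline as the interactive session above
    counts = (sc.textFile("/root/test.txt")
                .flatMap(lambda line: line.split(' '))
                .map(lambda word: (word, 1))
                .reduceByKey(add)
                .collect())

    # With the three-line test file this prints, in some order:
    # ('hello', 3) ('world', 3) ('1', 1) ('2', 1)
    for word, count in counts:
        print(word, count)

    sc.stop()

Run it inside the container with spark-submit wordcount.py; unlike the REPL, a standalone script must create its own SparkContext.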
