Spark hello world(加载本地文件和加载 hdfs 文件)

2017-04-12  本文已影响0人  牛马风情





[root@sandbox home]# cd /home/guest/

// 在guest 目录下创建一个文件夹
[root@sandbox guest]# mkdir erhuan
// 在新建的文件夹中创建一个测试文件
[root@sandbox guest]# cd erhuan/
[root@sandbox erhuan]# vi hellospark


[root@sandbox erhuan]# spark-shell
Spark assembly has been built with Hive, including Datanucleus jars on classpath
17/04/12 14:45:41 INFO SecurityManager: Changing view acls to: root
17/04/12 14:45:41 INFO SecurityManager: Changing modify acls to: root
17/04/12 14:45:41 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(root); users with modify permissions: Set(root)
17/04/12 14:45:41 INFO HttpServer: Starting HTTP Server
17/04/12 14:45:41 INFO Utils: Successfully started service 'HTTP class server' on port 47623.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.2.1

// 省略一堆输出


Spark context available as sc.
scala>  val textFile = sc.textFile("file:///home/guest/erhuan/hellospark")

// 省略一堆输出
textFile: org.apache.spark.rdd.RDD[String] = file:///home/guest/erhuan/hellospark MappedRDD[1] at textFile at <console>:12
scala> textFile.first()
// 省略一堆输出
17/04/12 14:53:27 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
17/04/12 14:53:27 INFO DAGScheduler: Job 0 finished: first at <console>:15, took 0.306226 s
res0: String = this is a hello word txt
// Spark 的转换操作(transformation)是惰性的:它只会记录之前的所有操作而不会立即执行,直到执行 action(例如 first())时才会真正触发之前记录的操作

scala> textFile.saveAsTextFile("file:///home/guest/erhuan/wordres")
17/04/12 14:59:31 INFO DefaultExecutionContext: Starting job: saveAsTextFile at <console>:15
17/04/12 14:59:31 INFO DAGScheduler: Got job 6 (saveAsTextFile at <console>:15) with 2 output partitions (allowLocal=false)
// 省略一堆输出


scala> exit
[root@sandbox erhuan]# cd wordres/
[root@sandbox wordres]# ll
total 4
-rw-r--r-- 1 root root 25 2017-04-12 14:59 part-00000
-rw-r--r-- 1 root root  0 2017-04-12 14:59 part-00001
-rw-r--r-- 1 root root  0 2017-04-12 14:59 _SUCCESS
[root@sandbox wordres]# more part-00000
this is a hello word txt
// 完成spark 对本地文件的加载和写入


// 首先将文件拷贝到 hdfs 上;为避免权限问题,先将文件移动到 /tmp 目录下,再由 hdfs 用户上传
[root@sandbox tmp]# mv /home/guest/erhuan/hellospark /tmp
[hdfs@sandbox tmp]$ hadoop fs -mkdir -p /user/erhuan
[hdfs@sandbox tmp]$ hadoop fs -put /tmp/hellospark /user/erhuan
[root@sandbox erhuan]# spark-shell
Spark assembly has been built with Hive, including Datanucleus jars on classpath
17/04/12 14:45:41 INFO SecurityManager: Changing view acls to: root
17/04/12 14:45:41 INFO SecurityManager: Changing modify acls to: root
17/04/12 14:45:41 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(root); users with modify permissions: Set(root)
17/04/12 14:45:41 INFO HttpServer: Starting HTTP Server
17/04/12 14:45:41 INFO Utils: Successfully started service 'HTTP class server' on port 47623.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.2.1

// 省略一堆输出


Spark context available as sc.
scala> val textFile = sc.textFile("/user/erhuan/hellospark")
17/04/12 15:33:29 INFO MemoryStore: ensureFreeSpace(277063) called with curMem=684755, ...
// 省略一堆输出

// 执行一次action 查看是否执行成功
scala> textFile.first()
// 省略一堆输出
17/04/12 15:33:32 INFO DAGScheduler: Job 0 finished: first at <console>:15, took 0.543566 s
res3: String = this is a hello word txt


scala> textFile.saveAsTextFile("/user/erhuan/res")
17/04/12 15:36:34 INFO DefaultExecutionContext: Starting job: saveAsTextFile at <console>:15
17/04/12 15:36:34 INFO DAGScheduler: Got job 1 (saveAsTextFile at <console>:15) with 2 output partitions (allowLocal=false)
// 省略一堆输出

[hdfs@sandbox tmp]$ hadoop fs -ls /user/erhuan/res
Found 3 items
-rw-r--r--   1 hdfs hdfs          0 2017-04-12 15:36 /user/erhuan/res/_SUCCESS
-rw-r--r--   1 hdfs hdfs         25 2017-04-12 15:36 /user/erhuan/res/part-00000
-rw-r--r--   1 hdfs hdfs          0 2017-04-12 15:36 /user/erhuan/res/part-00001
[hdfs@sandbox tmp]$ hadoop fs -cat /user/erhuan/res/part-00000
this is a hello word txt

上一篇 下一篇

