J2EE Revisited

Submitting a Job from a Java Web App to a Remote Hadoop Cluster

2017-05-07  by 2010jing

A previous post, Writing MapReduce Programs in Eclipse, actually ran everything locally.
I searched the web for tutorials on submitting a job from a Java web project so that the MapReduce runs on the server side, but none of the material I found worked as written. After working through a string of errors I finally got it running, so this post records the process.

1. Create an ordinary Java web project named WordCountPage

This article uses WordCount as an example; it is only meant as a starting point.

2. Code

TWC3.java

//TWC3.java
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TWC3 {
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String valueString = value.toString();

            StringTokenizer itr = new StringTokenizer(valueString);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(TWC3.class);
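        // Point the job at the pre-built jar on the client's filesystem; setJarByClass alone
        // is not enough here, because the web app's classes are not packaged in a jar that
        // the cluster can fetch (see the jar uploaded in step 6).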
        job.setJar(args[2]);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        Path output = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // System.exit(job.waitForCompletion(true)?0:1);
        job.waitForCompletion(true);

    }
}

WCServlet.java

//WCServlet.java
import java.io.IOException;
import java.util.Calendar;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

/**
 * Servlet implementation class WCServlet
 */
@WebServlet("/WCServlet")
public class WCServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;
       
    /**
     * @see HttpServlet#HttpServlet()
     */
    public WCServlet() {
        super();
    }

    /**
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        response.getWriter().append("Served at: ").append(request.getContextPath());
        String[] args=new String[3];  
         Calendar now = Calendar.getInstance(); 
        args[0]="hdfs://master:9000/input/";
        args[1]="hdfs://master:9000/output/"+ now.getTimeInMillis();

        String a = System.getProperty("catalina.home") +"/lib/userlib/TWC2-3.jar";
//      System.out.println("========a======="+a );
        response.getWriter().append("Served at: " + a);

        args[2] = a;
        try {
            TWC3.main(args);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();

        }
        
    }

    /**
     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        // TODO Auto-generated method stub
        doGet(request, response);
    }

}

index.jsp

<%@ page language="java" contentType="text/html; charset=ISO-8859-1"
    pageEncoding="ISO-8859-1"%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>Insert title here</title>
</head>
<body>

    <form action="WCServlet" method="post">
        <input type="text" name="keyword"> <br>
        <input type="submit" value="Submit">
    </form>
 
    <hr>
</body>
</html>
A quick walkthrough of the code:
index.jsp

1. It contains a single form; submitting it sends the request to WCServlet, which handles the rest.

WCServlet.java

1. doPost() simply delegates to doGet().
2. A String[] args array is created and filled in: args[0]="hdfs://master:9000/input/" is the data source the MR program will read, and args[1]="hdfs://master:9000/output/"+ now.getTimeInMillis() is the directory where the results will be written.
3. The Hadoop server does not have your MR program, so you must upload it in advance.
String a = System.getProperty("catalina.home") +"/lib/userlib/TWC2-3.jar"; is the path of the jar file the MR job will use. args[2] = a; stores that path in the array so it can be passed along.
4. TWC3.main(args); calls the main method of TWC3, passing the array as its arguments.

TWC3.java

1. MyMapper and MyReducer implement map and reduce respectively.
2. main() sets up the job configuration.
3. job.setJar(args[2]); specifies the jar file to execute. As mentioned above, the server does not have your MR implementation, so you must package it into a jar yourself and upload it ahead of time.

3. Add the related files to the Eclipse project

[screenshot: related-files.png]

These files can be copied down from the Hadoop server (with FileZilla, Xftp, or a similar tool) and then lightly modified; a small check sketch follows the listings below.

core-site.xml

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>fs.default.name</name>
        <value>hdfs://master:9000</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>file:/home/ubuntu/developer/hadoop-2.7.3/tmp</value>
    </property>
    <property>
        <name>io.file.buffer.size</name>
        <value>131702</value>
    </property>
</configuration>
hdfs-site.xml
<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:/home/ubuntu/developer/hadoop-2.7.3/hdfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:/home/ubuntu/developer/hadoop-2.7.3/hdfs/data</value>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>master:9001</value>
    </property>
    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
        <value>false</value>
    </property>
    <property>
        <name>dfs.permissions</name>
        <value>false</value>
    </property> 
</configuration>
mapred-site.xml
<!-- Put site-specific property overrides in this file. -->

<configuration>

    <property>
        <name>mapred.remote.os</name>
        <value>Linux</value>
    </property>
     
    <property>
        <name>mapreduce.app-submission.cross-platform</name>
        <value>true</value>
    </property>
    
    <property>
        <name>mapreduce.application.classpath</name>
        <value>/home/ubuntu/developer/hadoop-2.7.3/etc/hadoop,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/common/*,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/common/lib/*,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/hdfs/*,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/hdfs/lib/*,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/mapreduce/*,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/mapreduce/lib/*,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/yarn/*,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/yarn/lib/*
        </value>
    </property>

<!-- ===========================================  -->
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>master:10020</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>master:19888</value>
    </property>


    <property>  
        <name>mapreduce.map.memory.mb</name>  
        <value>1024</value>  
    </property>  
    <property>  
        <name>mapreduce.reduce.memory.mb</name>  
        <value>1024</value>  
    </property>  
    <property>  
        <name>mapreduce.map.java.opts</name>  
        <value>-Xmx512m</value>  
    </property>  
    <property>  
        <name>mapreduce.reduce.java.opts</name>  
        <value>-Xmx512m</value>  
    </property>  
</configuration>
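
The mapred.remote.os and mapreduce.app-submission.cross-platform entries above are what allow the job to be submitted from a client whose operating system differs from the cluster's (for example, a Windows development machine). The same keys can also be set programmatically in the driver; the snippet below is only an illustrative sketch, not part of the article's TWC3 code.

// Sketch: setting the cross-platform submission keys in code instead of mapred-site.xml.
Configuration conf = new Configuration();
conf.set("mapreduce.app-submission.cross-platform", "true"); // client OS may differ from the cluster OS
conf.set("mapred.remote.os", "Linux");                       // legacy key, kept to mirror the XML above
conf.set("mapreduce.framework.name", "yarn");
conf.set("yarn.resourcemanager.hostname", "master");
Job job = Job.getInstance(conf);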

yarn-site.xml
<configuration>

<!-- Site specific YARN configuration properties -->

<property>
    <name>yarn.application.classpath</name>
    <value>/home/ubuntu/developer/hadoop-2.7.3/etc/hadoop,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/common/*,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/common/lib/*,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/hdfs/*,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/hdfs/lib/*,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/mapreduce/*,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/mapreduce/lib/*,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/yarn/*,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/yarn/lib/*</value>
</property>
<!-- Site specific YARN configuration properties -->
    <property>  
            <description>The hostname of the RM.</description>  
            <name>yarn.resourcemanager.hostname</name>  
            <value>master</value>  
    </property>  

    <property>
      <name>yarn.nodemanager.aux-services</name>
      <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.auxservices.mapreduce.shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>
    <property>
        <name>yarn.resourcemanager.address</name>
        <value>${yarn.resourcemanager.hostname}:8032</value>
    </property>
    <property>
        <name>yarn.resourcemanager.scheduler.address</name>
        <value>${yarn.resourcemanager.hostname}:8030</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address</name>
        <value>${yarn.resourcemanager.hostname}:8031</value>
    </property>
    <property>
        <name>yarn.resourcemanager.admin.address</name>
        <value>${yarn.resourcemanager.hostname}:8033</value>
    </property>
    <property>
        <name>yarn.resourcemanager.webapp.address</name>
        <value>${yarn.resourcemanager.hostname}:8088</value>
    </property>

    <property>  
        <description>The https address of the RM web application.</description>  
        <name>yarn.resourcemanager.webapp.https.address</name>  
        <value>${yarn.resourcemanager.hostname}:8090</value>  
   </property>

  <property>  
    <description>List of directories to store localized files in. An   
      application's localized file directory will be found in:  
      ${yarn.nodemanager.local-dirs}/usercache/${user}/appcache/application_${appid}.  
      Individual containers' work directories, called container_${contid}, will  
      be subdirectories of this.  
   </description>  
    <name>yarn.nodemanager.local-dirs</name>  
    <value>/data/hadoop/yarn/local</value>  
  </property>  
  
  <property>  
    <description>Whether to enable log aggregation</description>  
    <name>yarn.log-aggregation-enable</name>  
    <value>true</value>  
  </property>  
  
  <property>  
    <description>Where to aggregate logs to.</description>  
    <name>yarn.nodemanager.remote-app-log-dir</name>  
    <value>/data/tmp/logs</value>  
  </property>  
  
  <property>  
    <description>Amount of physical memory, in MB, that can be allocated   
    for containers.</description>  
    <name>yarn.nodemanager.resource.memory-mb</name>  
    <value>2048</value>  
  </property>  
<property>  
    <name>yarn.scheduler.minimum-allocation-mb</name>  
    <value>512</value>  
</property>  
<property>  
    <name>yarn.nodemanager.vmem-pmem-ratio</name>  
    <value>1.0</value>  
</property>  
<property>  
    <name>yarn.nodemanager.vmem-check-enabled</name>  
    <value>false</value>  
</property> 

</configuration>
log4j.properties
log4j.rootLogger=INFO, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n 
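
With all of these files on the project's classpath (for example under src, so they end up in WEB-INF/classes), new Configuration() picks up core-site.xml automatically, and the remaining site files can be added explicitly. The following is a minimal verification sketch; the ConfigCheck class is my own illustration, not part of the WordCountPage project.

//ConfigCheck.java
import org.apache.hadoop.conf.Configuration;

public class ConfigCheck {
    public static void main(String[] args) {
        // new Configuration() loads core-default.xml and core-site.xml from the classpath.
        Configuration conf = new Configuration();
        // Load the other copied site files explicitly so their values can be inspected here.
        conf.addResource("mapred-site.xml");
        conf.addResource("yarn-site.xml");
        System.out.println("fs.default.name               = " + conf.get("fs.default.name"));
        System.out.println("mapreduce.framework.name      = " + conf.get("mapreduce.framework.name"));
        System.out.println("yarn.resourcemanager.hostname = " + conf.get("yarn.resourcemanager.hostname"));
    }
}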

4. Export TWC3.java as a jar file

Right-click the TWC3.java file -- Export... -- Runnable JAR file

[screenshot: Runnable JAR file.png]

Save it to a location of your choice. Note that the name of the jar you end up uploading must match the path hard-coded in WCServlet (TWC2-3.jar in this example).

[screenshot: twc3.jar]

5. Export the WordCountPage project as a WAR file

Right-click the WordCountPage project -- Export -- WAR file


6. Upload the files to the server

1. Upload TWC2-3.jar to /home/ubuntu/developer/apache-tomcat-8.5.14/lib/userlib on the server.

[screenshot: twc2-3.jar-to-tomcat]

2. Upload WordCountPage.war to /home/ubuntu/developer/apache-tomcat-8.5.14/webapps on the server.

[screenshot: WordCountPage.war-to-tomcat.png]

7. Start Tomcat

On the server, go into the Tomcat directory and run:

bin/startup.sh

[screenshot: startup-tomcat]

8. Run and test

Open http://your-ip:port/your-project in a browser and submit the form (it posts to /WCServlet, which triggers the job).


Problems you may run into

[screenshot: permission-denied]

This usually means the user running Tomcat cannot write to the job's staging directory in HDFS. Fix:

hdfs dfs -chmod -R 755 /tmp
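
An alternative (my own sketch, not from the original article) is to make the web app talk to HDFS as the user that owns the directories, instead of loosening permissions:

// Set before any FileSystem or Job objects are created, e.g. at the top of doGet().
// "ubuntu" is an assumed cluster-side user name; adjust to your environment.
System.setProperty("HADOOP_USER_NAME", "ubuntu");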

Viewing the Tomcat log in real time on Linux
1. Switch to the log directory: cd tomcat/logs

2. tail -f catalina.out

3. You can now watch the log output live while the app runs.

Press Ctrl+C to exit tail.
