hadoop3.1.4+hive3.1.2+spark3.1.3

2022-06-15  于飞_d529

Environment

| Host | IP | Roles |
| yfbd-virtual-machine-01 | 10.216.6.227 | zookeeper, datanode, namenode (nn1), spark master/worker, hive, hive metastore |
| yfbd-virtual-machine-02 | 10.216.6.228 | zookeeper, datanode, namenode (nn2, standby), spark worker, hive |
| yfbd-virtual-machine-03 | 10.216.6.229 | zookeeper, datanode, spark worker, hive |

Configure passwordless SSH

1. Edit the hosts file

10.216.6.227 yfbd-virtual-machine-01
10.216.6.228 yfbd-virtual-machine-02
10.216.6.229 yfbd-virtual-machine-03

2. Set up passwordless login

ssh-keygen
ssh-copy-id -i /home/yfbd/.ssh/id_rsa.pub yfbd-virtual-machine-02
ssh-copy-id -i /home/yfbd/.ssh/id_rsa.pub yfbd-virtual-machine-03
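
start-all.sh later also logs in to the local machine over SSH, so it usually helps to copy the key to yfbd-virtual-machine-01 as well and to verify each hop (a minimal check, run as the yfbd user):

ssh-copy-id -i /home/yfbd/.ssh/id_rsa.pub yfbd-virtual-machine-01
for h in yfbd-virtual-machine-01 yfbd-virtual-machine-02 yfbd-virtual-machine-03; do
  ssh "$h" hostname    # should print the hostname without asking for a password
done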

Configure environment variables

vim /etc/profile
#JAVA_HOME
export JAVA_HOME=/home/yfbd/bigdata/jdk1.8
export PATH=$PATH:$JAVA_HOME/bin
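
Reload the profile and make sure the JDK is picked up:

source /etc/profile
java -version      # should report a 1.8.x JDK
echo $JAVA_HOME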

Hadoop HA deployment

1. Configure core-site.xml

vim /home/yfbd/bigdata/hadoop-3.1.4/etc/hadoop/core-site.xml
<configuration>
    <property>  
        <name>fs.defaultFS</name>  
        <value>hdfs://ns</value>  
    </property> 
    <!-- Hadoop data storage directory -->
    <property>  
        <name>hadoop.tmp.dir</name>  
        <value>/home/yfbd/bigdata/hadoop-3.1.4/data/tmp</value>  
    </property> 
    <!-- Static user used by the HDFS web UI -->
    <property>  
        <name>hadoop.http.staticuser.user</name>  
        <value>yfbd</value>  
    </property> 
    <!-- Hosts from which the yfbd (superuser) proxy user may connect -->
    <property>
        <name>hadoop.proxyuser.yfbd.hosts</name>
        <value>*</value>
    </property>
    <!-- Groups the yfbd (superuser) proxy user may impersonate -->
    <property>
        <name>hadoop.proxyuser.yfbd.groups</name>
        <value>*</value>
    </property>
    <!-- ZooKeeper quorum used for HA -->
    <property>  
        <name>ha.zookeeper.quorum</name>  
        <value>yfbd-virtual-machine-01:2181,yfbd-virtual-machine-02:2181,yfbd-virtual-machine-03:2181</value>  
    </property>
</configuration>
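
The HA machinery below assumes the ZooKeeper ensemble listed in ha.zookeeper.quorum is already running. A quick check on each node (the ZooKeeper install path is taken from the environment variables later in this guide):

/home/yfbd/bigdata/zookeeper/bin/zkServer.sh status    # should report Mode: leader or Mode: follower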

2. Configure hdfs-site.xml

<configuration>
  <!-- Replication factor of 3 -->
 <property>  
      <name>dfs.replication</name>  
    <value>3</value>  
 </property> 
 <property>
   <name>dfs.nameservices</name>
   <value>ns</value>
 </property>
    
<!-- Two NameNodes for HA -->
 <property>
   <name>dfs.ha.namenodes.ns</name>
   <value>nn1,nn2</value>
 </property>
<!-- RPC address nn1 listens on -->
 <property>
   <name>dfs.namenode.rpc-address.ns.nn1</name>
   <value>yfbd-virtual-machine-01:8020</value>
 </property>
<!-- HTTP address nn1 listens on -->
 <property>
   <name>dfs.namenode.http-address.ns.nn1</name>
   <value>yfbd-virtual-machine-01:50070</value>
 </property>
<!-- RPC address nn2 listens on -->
 <property>
   <name>dfs.namenode.rpc-address.ns.nn2</name>
   <value>yfbd-virtual-machine-02:8020</value>
 </property>
<!-- HTTP address nn2 listens on -->
 <property>
   <name>dfs.namenode.http-address.ns.nn2</name>
   <value>yfbd-virtual-machine-02:50070</value>
 </property>
<!-- Shared storage for NameNode edits: the JournalNode quorum (usually co-located with ZooKeeper) -->
 <property>
   <name>dfs.namenode.shared.edits.dir</name>
   <value>qjournal://yfbd-virtual-machine-01:8485;yfbd-virtual-machine-02:8485;yfbd-virtual-machine-03:8485/ns</value>
 </property>
<!-- Local directory for JournalNode edit logs -->
 <property>
   <name>dfs.journalnode.edits.dir</name>
   <value>/home/yfbd/bigdata/hadoop-3.1.4/data/journal</value>
 </property>
<!-- Enable automatic failover; leave this out if automatic failover is not wanted -->
 <property>
   <name>dfs.ha.automatic-failover.enabled</name>
   <value>true</value>
 </property>
<!-- Failover proxy provider: the Java class HDFS clients use to find out which NameNode is currently active -->
 <property>
   <name>dfs.client.failover.proxy.provider.ns</name>
   <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
 </property>
<!-- Fencing method used during failover: sshfence logs in to the failed NameNode over SSH and kills the process
     (it relies on the passwordless SSH configured earlier); see the official docs for other options -->
 <property>
   <name>dfs.ha.fencing.methods</name>
   <value>sshfence</value>
 </property>
 <property>
   <name>dfs.permissions.enabled</name>
   <value>false</value>
 </property>
</configuration>

3. Configure yarn-site.xml

<configuration>
    <!-- Use the MapReduce shuffle auxiliary service -->
    <property>  
        <name>yarn.nodemanager.aux-services</name>  
        <value>mapreduce_shuffle</value>  
    </property>
    <!-- Site specific YARN configuration properties -->
    <!-- Enable ResourceManager HA (off by default) -->
    <property>  
       <name>yarn.resourcemanager.ha.enabled</name>  
       <value>true</value>  
    </property>  
    <!-- Cluster id and the addresses of the three ResourceManagers -->
    <property>  
       <name>yarn.resourcemanager.cluster-id</name>  
       <value>rmcluster</value>  
    </property>  
    <property>  
       <name>yarn.resourcemanager.ha.rm-ids</name>  
       <value>rm1,rm2,rm3</value>  
    </property>  
    <property>  
       <name>yarn.resourcemanager.hostname.rm1</name>  
       <value>yfbd-virtual-machine-01</value>  
    </property>  
    <property>  
       <name>yarn.resourcemanager.hostname.rm2</name>  
       <value>yfbd-virtual-machine-02</value>  
    </property>  
    <property>
       <name>yarn.resourcemanager.hostname.rm3</name>
       <value>yfbd-virtual-machine-03</value>
    </property> 
    <!-- ZooKeeper ensemble address -->
    <property>  
       <name>yarn.resourcemanager.zk-address</name>  
        <value>yfbd-virtual-machine-01:2181,yfbd-virtual-machine-02:2181,yfbd-virtual-machine-03:2181</value>  
    </property>  
    <!-- Enable recovery so running applications survive a ResourceManager failover (default is false) -->
    <property>  
       <name>yarn.resourcemanager.recovery.enabled</name>  
       <value>true</value>  
    </property>  
   
    <!-- Store ResourceManager state in ZooKeeper (the default store is the FileSystem-based one) -->
    <property>  
       <name>yarn.resourcemanager.store.class</name>  
     <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value> 
    </property> 
    
    <!-- Environment variables inherited by containers -->
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>
    
    <!-- Enable log aggregation -->
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>

    <!-- Log server (JobHistory) URL -->
    <property>  
        <name>yarn.log.server.url</name>  
        <value>http://yfbd-virtual-machine-01:19888/jobhistory/logs</value>
    </property>

    <!-- Keep aggregated logs for 7 days -->
    <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>604800</value>
    </property>
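    <!-- Identifier of the ResourceManager running on this host; on
         yfbd-virtual-machine-02 / 03 this value must be changed to rm2 / rm3 -->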
    <property>  
       <name>yarn.resourcemanager.ha.id</name>  
       <value>rm1</value>  
    </property>
    <property>
        <name>yarn.nodemanager.pmem-check-enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>yarn.nodemanager.vmem-check-enabled</name>
        <value>false</value>
    </property>
</configuration>

4. Configure workers

yfbd-virtual-machine-01
yfbd-virtual-machine-02
yfbd-virtual-machine-03
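
The same configuration must be present on all three nodes before anything is started. A minimal sketch, assuming the whole Hadoop directory is simply copied from yfbd-virtual-machine-01:

for h in yfbd-virtual-machine-02 yfbd-virtual-machine-03; do
  rsync -a /home/yfbd/bigdata/hadoop-3.1.4/ "$h":/home/yfbd/bigdata/hadoop-3.1.4/
done
# repeat the /etc/profile changes on each node and set yarn.resourcemanager.ha.id
# to rm2 on machine-02 and rm3 on machine-03 in the copied yarn-site.xml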

5. Format the NameNode

cd /home/yfbd/bigdata/hadoop-3.1.4/bin
./hadoop namenode -format
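
With JournalNode-based HA the format on its own is not enough; a sketch of the surrounding bootstrap steps this layout normally needs (order and paths are assumptions based on the configs above):

# on all three nodes, before the format: the JournalNodes must already be running
./hdfs --daemon start journalnode

# on yfbd-virtual-machine-01, after the format: start the first NameNode
./hdfs --daemon start namenode

# on yfbd-virtual-machine-02: pull the formatted metadata for the standby NameNode
./hdfs namenode -bootstrapStandby

# on yfbd-virtual-machine-01: create the automatic-failover znode in ZooKeeper
./hdfs zkfc -formatZK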

6. Start the cluster

cd /home/yfbd/bigdata/hadoop-3.1.4/sbin
./start-all.sh
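
A quick sanity check once start-all.sh has finished (run from the same sbin directory):

jps                                        # expect NameNode, DataNode, JournalNode, DFSZKFailoverController, ResourceManager, NodeManager
../bin/hdfs haadmin -getServiceState nn1   # one of nn1/nn2 should be active, the other standby
../bin/hdfs haadmin -getServiceState nn2
../bin/yarn rmadmin -getServiceState rm1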

Hive deployment

1. Configure environment variables

export HADOOP_HOME=/home/yfbd/bigdata/hadoop-3.1.4
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin

export HADOOP_MAPRED_HOME=${HADOOP_HOME}
export HADOOP_COMMON_HOME=${HADOOP_HOME}
export HADOOP_HDFS_HOME=${HADOOP_HOME}
export HADOOP_YARN_HOME=${HADOOP_HOME}

export HADOOP_CONF_DIR=/home/yfbd/bigdata/hadoop-3.1.4/etc/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

export ZOOKEEPER_HOME=/home/yfbd/bigdata/zookeeper
export PATH=$PATH:$ZOOKEEPER_HOME/bin

export HIVE_HOME=/home/yfbd/bigdata/hive3.1.2
export PATH=$PATH:$HIVE_HOME/bin

2. Configure hive-site.xml

<configuration>
    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/user/hive_remote/warehouse</value>
        <description>Default warehouse directory in HDFS</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://10.216.3.17:3306/hive3_remote?createDatabaseIfNotExist=true&amp;useSSL=false&amp;allowPublicKeyRetrieval=true</value>
        <description>JDBC connection string for the metastore database</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.cj.jdbc.Driver</value>
        <description>JDBC driver; the jar must be copied into ${HIVE_HOME}/lib</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>hive3</value>
        <description>Database user name</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>123456</value>
        <description>Database password</description>
    </property>
    <property>
        <name>hive.cli.print.header</name>
        <value>true</value>
    </property>
    <property>
        <name>hive.cli.print.current.db</name>
        <value>true</value>
    </property>
    <property> 
        <name>spark.home</name> 
        <value>/home/yfbd/bigdata/spark-3.1.3-bin-hadoop3.2</value> 
    </property>
    <property>
        <name>hive.aux.jars.path</name>
        <value>file:///home/yfbd/bigdata/hive3.1.2/lib</value>
    </property>
</configuration>

A hive-site.xml has only one <configuration> root, so in practice the metastore address below is merged into the file above; it is what points Hive clients at the metastore service:

<configuration>
    <property>
        <name>hive.metastore.uris</name>
        <value>thrift://yfbd-virtual-machine-01:9083</value>
        <description>Metastore address</description>
    </property>
</configuration>

3. Initialize the Hive metastore schema

cd /home/yfbd/bigdata/hive3.1.2/bin
schematool -dbType mysql -initSchema
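
schematool fails if the MySQL JDBC driver is missing from Hive's classpath; a quick check (the exact jar name is an assumption, any mysql-connector-java 8.x matching com.mysql.cj.jdbc.Driver works):

ls /home/yfbd/bigdata/hive3.1.2/lib/ | grep -i mysql-connector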

4. Start the metastore service

hive --service metastore
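
hive --service metastore runs in the foreground; for a long-running service it is usually started in the background and the Thrift port is verified (the log path is just an example):

nohup hive --service metastore > /home/yfbd/bigdata/hive3.1.2/metastore.log 2>&1 &
ss -lntp | grep 9083    # the metastore should be listening on the port from hive.metastore.uris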

Spark standalone (master/worker) deployment

1. Download the Spark package

https://mirrors.tuna.tsinghua.edu.cn/apache/spark/spark-3.1.3/spark-3.1.3-bin-hadoop3.2.tgz

2. Extract

tar -zxvf spark-3.1.3-bin-hadoop3.2.tgz

3. Symlink the Hadoop configuration files

cd /home/yfbd/bigdata/spark-3.1.3-bin-hadoop3.2/conf
ln -s /home/yfbd/bigdata/hadoop-3.1.4/etc/hadoop/core-site.xml
ln -s /home/yfbd/bigdata/hadoop-3.1.4/etc/hadoop/hdfs-site.xml

4. Add hive-site.xml

vim hive-site.xml
<configuration>
    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/user/hive_remote/warehouse</value>
        <description>Default warehouse directory in HDFS</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://10.216.3.17:3306/hive3_remote?createDatabaseIfNotExist=true&amp;useSSL=false&amp;allowPublicKeyRetrieval=true</value>
        <description>JDBC connection string for the metastore database</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.cj.jdbc.Driver</value>
        <description>JDBC driver; the jar must be copied into ${HIVE_HOME}/lib</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>hive3</value>
        <description>Database user name</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>123456</value>
        <description>Database password</description>
    </property>
    <property>
        <name>hive.cli.print.header</name>
        <value>true</value>
    </property>
    <property>
        <name>hive.cli.print.current.db</name>
        <value>true</value>
    </property>
    <property> 
        <name>spark.home</name> 
        <value>/home/yfbd/bigdata/spark-3.1.3-bin-hadoop3.2</value> 
    </property>
    <property>
        <name>hive.aux.jars.path</name>
        <value>file:///home/yfbd/bigdata/hive3.1.2/lib</value> 
    </property>
</configuration>

5. Edit spark-env.sh

export JAVA_HOME=/home/yfbd/bigdata/jdk1.8
export HADOOP_HOME=/home/yfbd/bigdata/hadoop-3.1.4
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_MASTER_HOST=yfbd-virtual-machine-01
export SPARK_MASTER_WEBUI_PORT=8060
export SPARK_WORKER_WEBUI_PORT=8061
export SPARK_WORKER_MEMORY=500m
export SPARK_WORKER_CORES=1
export SPARK_DAEMON_JAVA_OPTS="-Dspark.deploy.recoveryMode=ZOOKEEPER
 -Dspark.deploy.zookeeper.url=yfbd-virtual-machine-01:2181,yfbd-virtual-machine-02:2181,yfbd-virtual-machine-03:2181
 -Dspark.deploy.zookeeper.dir=/opt/hadoop/data/zookeeper/spark"
export HIVE_HOME=/home/yfbd/bigdata/hive3.1.2
export YARN_CONF_DIR=/home/yfbd/bigdata/hadoop-3.1.4/etc/hadoop

6. Edit workers

yfbd-virtual-machine-01
yfbd-virtual-machine-02
yfbd-virtual-machine-03

7. Edit spark-defaults.conf

spark.sql.hive.metastore.version        3.1.2
spark.sql.hive.metastore.jars           path
spark.sql.hive.metastore.jars.path      file:///home/yfbd/bigdata/hive3.1.2/lib/*.jar
# the metastore address is passed to the embedded Hive client via the spark.hadoop.* prefix
spark.hadoop.hive.metastore.uris        thrift://yfbd-virtual-machine-01:9083
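
Before starting the standalone cluster, the configured Spark directory has to exist on every worker; a minimal sketch:

for h in yfbd-virtual-machine-02 yfbd-virtual-machine-03; do
  rsync -a /home/yfbd/bigdata/spark-3.1.3-bin-hadoop3.2/ "$h":/home/yfbd/bigdata/spark-3.1.3-bin-hadoop3.2/
done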

8. Start Spark

cd /home/yfbd/bigdata/spark-3.1.3-bin-hadoop3.2/sbin
./start-all.sh
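
A quick check that the standalone cluster is up and that Spark SQL can reach the Hive metastore (ports and paths are the ones configured above):

jps | grep -E 'Master|Worker'            # Master on machine-01, a Worker on every node
# master web UI: http://yfbd-virtual-machine-01:8060
../bin/spark-sql -e "show databases;"    # should list the databases known to the metastore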

Kyuubi deployment

1. Download and extract the Kyuubi package

https://dlcdn.apache.org/incubator/kyuubi/kyuubi-1.5.1-incubating/apache-kyuubi-1.5.1-incubating-bin.tgz
tar -zxvf apache-kyuubi-1.5.1-incubating-bin.tgz

2. Configure kyuubi-defaults.conf

cp kyuubi-defaults.conf.template kyuubi-defaults.conf

kyuubi.ha.zookeeper.quorum=yfbd-virtual-machine-01:2181,yfbd-virtual-machine-02:2181,yfbd-virtual-machine-03:2181
kyuubi.authentication=NONE
kyuubi.engine.share.level=USER
kyuubi.frontend.bind.host=0.0.0.0
kyuubi.frontend.bind.port=10009
kyuubi.ha.zookeeper.namespace=kyuubi
kyuubi.session.engine.idle.timeout=PT10H

spark.master=yarn
spark.submit.deployMode=cluster
spark.dynamicAllocation.enabled=true
spark.dynamicAllocation.minExecutors=0
spark.dynamicAllocation.maxExecutors=20
spark.dynamicAllocation.executorIdleTimeout=60
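# note: dynamic allocation with the external shuffle service also requires the
# spark_shuffle aux service to be registered in yarn-site.xml on the NodeManagers;
# the yarn-site.xml above only configures mapreduce_shuffle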
spark.shuffle.service.enabled=true

3. Configure kyuubi-env.sh

cp kyuubi-env.sh.template kyuubi-env.sh

export JAVA_HOME=/home/yfbd/bigdata/jdk1.8
export SPARK_HOME=/home/yfbd/bigdata/spark-3.1.3-bin-hadoop3.2
export SPARK_CONF_DIR=${SPARK_HOME}/conf
export HADOOP_CONF_DIR=/home/yfbd/bigdata/hadoop-3.1.4/etc/hadoop
export KYUUBI_MAX_LOG_FILES=10

4. Copy hive-site.xml

cp /home/yfbd/bigdata/hive3.1.2/conf/hive-site.xml /home/yfbd/bigdata/apache-kyuubi-1.5.1-incubating-bin/conf/

5. Start Kyuubi

cd /home/yfbd/bigdata/apache-kyuubi-1.5.1-incubating-bin/bin
./kyuubi start
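
Once started, Kyuubi registers itself in ZooKeeper and listens on port 10009; a simple smoke test with the beeline that ships with Spark (the user name is just an example):

jps | grep KyuubiServer
/home/yfbd/bigdata/spark-3.1.3-bin-hadoop3.2/bin/beeline \
  -u 'jdbc:hive2://yfbd-virtual-machine-01:10009/' -n yfbd -e 'show databases;'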