人工智能

AI算力平台基础设施

2024-09-14  本文已影响0人  sknfie

概述

AI算力平台建立在CentOS基础上,通过Docker容器化部署,利用Kubernetes进行调度。

安装基础设施平台

安装docker

# Remove any previously installed Docker packages first.
service docker stop
# -r: do not run "yum remove" at all when no docker package is installed
# (without it, xargs would invoke "yum remove -y" with no package names).
rpm -qa | grep docker | xargs -r yum remove -y
# Should print nothing once every docker package has been removed.
rpm -qa | grep docker
rm -rf /usr/lib/systemd/system/docker.service

# Add the Docker CE repository (Aliyun mirror), fetched over HTTPS.
yum install -y container-selinux yum-utils

yum-config-manager --add-repo https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo

yum update -y
# List the available versions.
yum list docker-ce --showduplicates
# Install the latest version. To pin a specific version instead, use one of
# the commented commands below (pick the one matching the OS release).
yum install -y docker-ce
#yum install -y docker-ce-26.1.3-1.el8
#yum install -y docker-ce-26.1.3-1.el9

# Start Docker now and enable it at boot: a later step reboots the host, and
# Docker must come back up on its own afterwards.
systemctl enable --now docker

配置k8s的yum源并安装kubectl

# Register the Kubernetes yum repository (Aliyun mirror). The repo definition
# is static text, so the heredoc delimiter is quoted to keep it literal.
cat > /etc/yum.repos.d/kubernetes.repo <<'REPO'
[kubernetes]
name=Kubernetes
baseurl=https://mirrors.aliyun.com/kubernetes/yum/repos/kubernetes-el7-x86_64/
enabled=1
gpgcheck=1
repo_gpgcheck=1
gpgkey=https://mirrors.aliyun.com/kubernetes/yum/doc/yum-key.gpg https://mirrors.aliyun.com/kubernetes/yum/doc/rpm-package-key.gpg
REPO
# Switch SELinux to permissive mode for the running system.
setenforce 0
# Install a pinned kubectl release.
yum -y install kubectl-1.24.0
# Load kubectl tab completion into the current bash session.
. <(kubectl completion bash)

centos系统初始化

# Edit /etc/firewalld/firewalld.conf
#FirewallBackend=nftables
#FirewallBackend=iptables

yum install -y yum-utils device-mapper-persistent-data lvm2
yum install -y iptables container-selinux iptables-services
# Register boot-time commands in rc.local: stop/disable the host firewall
# services and load the netfilter kernel modules.
# The append is skipped when rc.local already contains the snippet, so
# re-running this step does not pile up duplicate copies.
if ! grep -qs 'modprobe br_netfilter' /etc/rc.d/rc.local; then
cat << 'EOF' >> /etc/rc.d/rc.local

systemctl stop firewalld
systemctl disable firewalld
systemctl stop iptables
systemctl disable iptables
systemctl stop ip6tables
systemctl disable ip6tables
systemctl stop nftables
systemctl disable nftables

modprobe br_netfilter
modprobe ip_tables
modprobe iptable_nat
modprobe iptable_filter
modprobe iptable_mangle
modprobe ip6_tables
modprobe ip6table_nat
modprobe ip6table_filter
modprobe ip6table_mangle

EOF
fi
chmod +x /etc/rc.d/rc.local
# Apply the rc.local commands immediately instead of waiting for a reboot.
sh /etc/rc.d/rc.local
# Show the loaded kernel modules.
lsmod
# Also ask systemd-modules-load to load ip_tables at boot. CentOS reads
# /etc/modules-load.d/*.conf (not the Debian-style /etc/modules), and a plain
# redirection is used because "sudo echo ... >> file" does not elevate the
# redirection itself.
echo 'ip_tables' > /etc/modules-load.d/ip_tables.conf


# Report the state of each firewall-related service.
for unit in iptables ip6tables nftables firewalld; do
  systemctl status "$unit"
done

# Print details of the IPv4 and IPv6 NAT table modules.
for module in iptable_nat ip6table_nat; do
  modinfo "$module"
done

# Persist the bridge-netfilter and IP-forwarding settings in a dedicated
# drop-in file. The file is overwritten rather than appended to, so re-running
# this step never accumulates duplicate entries.
cat > /etc/sysctl.d/99-kubernetes.conf << 'EOF'
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-iptables = 1
net.ipv4.ip_forward = 1
EOF
# Set the IPv4 bridge value directly as well, then load the drop-in file.
echo "1" >/proc/sys/net/bridge/bridge-nf-call-iptables
sysctl -p /etc/sysctl.d/99-kubernetes.conf

# Restart Docker after the firewall/kernel-module/sysctl changes above.
systemctl restart docker

# Reboot the host; the commands below this point are run after it is back up.
reboot

# IPv6-related errors can be ignored.
# To list the available IPv6 netfilter modules:
# ls /lib/modules/`uname -r`/kernel/net/ipv6/netfilter/

部署rancher server

# Rancher release to deploy. Defined first so the pre-pulled rancher and
# rancher-agent images always match the server that is started below.
export RANCHER_CONTAINER_TAG=v2.8.5

# Images to pre-pull before starting the Rancher server.
images=(
  rancher/kube-api-auth:v0.2.1
  rancher/mirrored-coredns-coredns:1.9.4
  rancher/mirrored-coreos-etcd:v3.5.9
  "rancher/rancher:${RANCHER_CONTAINER_TAG}"
  rancher/rancher-webhook:v0.4.7
  rancher/mirrored-flannelcni-flannel:v0.19.2
  rancher/mirrored-cluster-proportional-autoscaler:1.8.6
  rancher/calico-cni:v3.26.3-rancher1
  rancher/mirrored-metrics-server:v0.6.2
  rancher/shell:v0.1.24
  rancher/rke-tools:v0.1.96
  rancher/mirrored-calico-node:v3.26.3
  rancher/mirrored-pause:3.7
  rancher/hyperkube:v1.25.16-rancher2
  "rancher/rancher-agent:${RANCHER_CONTAINER_TAG}"
  rancher/mirrored-calico-kube-controllers:v3.26.3
)

# Pull all images in parallel, remembering each job's PID.
pull_pids=()
for image in "${images[@]}"; do
  docker pull "$image" &
  pull_pids+=("$!")
done

# Wait for every pull individually so that a failed pull is reported instead
# of being silently discarded by a bare "wait".
pull_failed=0
for pid in "${pull_pids[@]}"; do
  wait "$pid" || pull_failed=1
done
if (( pull_failed )); then
  echo "warning: at least one docker pull failed; re-run the pulls before continuing" >&2
fi

sudo docker run -d --privileged --restart=unless-stopped -p 443:443 --name=myrancher -e AUDIT_LEVEL=3 "rancher/rancher:${RANCHER_CONTAINER_TAG}"
# Open https://xx.xx.xx.xx:443/ and wait until the web UI loads; this is
# expected to take 1-10 minutes.
# Then run the following command on the host to see the login password.
docker logs  myrancher  2>&1 | grep "Bootstrap Password:"

部署k8s集群

通过rancher部署k8s集群

上一篇 下一篇

猜你喜欢

热点阅读