Notes on troubleshooting and fixing an OOM in a Spring Boot application

2021-11-10  _Kantin

Background

A Spring Boot service that reads and writes a Kerberos-secured HDFS cluster began crashing with java.lang.OutOfMemoryError after running for some time. This post records how the leak was tracked down and fixed.

Troubleshooting process

The service is launched with the JVM options below. The flags that matter here are -XX:+HeapDumpOnOutOfMemoryError and -XX:HeapDumpPath, which make the JVM write a heap dump under /var/log/xxx at the moment the OOM fires; analyzing that dump (for example with Eclipse MAT, or jmap -histo against the live process) pointed at the HDFS client code examined in the next section.

${JAVA_EXEC} -server -XX:+UseG1GC -Xmx8G -Xms8G -Xss256k -XX:+HeapDumpOnOutOfMemoryError  -XX:HeapDumpPath=/var/log/xxx  -XX:MaxGCPauseMillis=300 -Xloggc:/var/log/xxx/xxx_gc.log  -XX:+PrintGCTimeStamps -XX:+PrintGCDetails   -Dservice.name=${EXEC_COMMEN} -Dfastjson.parser.safeMode=true   -cp "${FULL_PATH_SERVER_JAR}:${LIB_PATH}/*:${CONFIG_VERSION_PATH}/" ${MAIN_CLASS} >> ${LOG_FILE} 2>&1 &
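If a dump is needed before the OOM actually fires, for example while watching the leak grow, one can also be triggered from inside the JVM through the standard HotSpotDiagnosticMXBean (jmap -dump:live,format=b,file=... <pid> does the same from outside). A minimal sketch; the output file name is illustrative:

import com.sun.management.HotSpotDiagnosticMXBean;
import java.lang.management.ManagementFactory;

public class HeapDumper {
    public static void main(String[] args) throws Exception {
        // The same diagnostic MBean that jmap talks to.
        HotSpotDiagnosticMXBean diag =
                ManagementFactory.getPlatformMXBean(HotSpotDiagnosticMXBean.class);
        // live = true keeps only reachable objects (triggers a full GC first).
        diag.dumpHeap("/var/log/xxx/manual-dump.hprof", true);
    }
}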

Code investigation and fix

The original implementation is below. Every call builds a fresh Configuration, performs a keytab login, and then asks FileSystem.get(conf) for a client. The catch is that FileSystem.get serves instances from a static cache whose key includes the current UserGroupInformation, while loginUserFromKeytab installs a brand-new UGI on every call. Each invocation therefore creates a new DistributedFileSystem, caches it under a key that will never match again, and never closes it, so the cache grows until the heap is exhausted.

 public FileSystem getFileSystemInstance(Cluster cluster){
        FileSystem fs = null;
        HadoopClusterParam param = JSONObject.parseObject(cluster.getParam(), HadoopClusterParam.class);
        Configuration conf = new Configuration();
        System.setProperty("java.security.krb5.conf", param.getKrb5Conf());
        conf.set("dfs.namenode.kerberos.principal", param.getHdfsKerberosPrincipal());
        conf.set("dfs.namenode.kerberos.principal.pattern", "*");
        conf.set("hadoop.security.authentication", "kerberos");
        conf.set("fs.trash.interval", "1");
        conf.set("fs.defaultFS", String.format("hdfs://%s", param.getHaName()));
        conf.set(String.format("dfs.client.failover.proxy.provider.%s", param.getHaName()), "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
        conf.set(String.format("dfs.ha.namenodes.%s", param.getHaName()), param.getHaNamenodes());
        String[] nns = param.getHaNamenodes().split(",");
        String[] nnHosts = param.getNamenodeAddress().split(",");
        conf.set(String.format("dfs.namenode.rpc-address.%s.%s", param.getHaName(), nns[0]), String.format("%s:8020", nnHosts[0]));
        conf.set(String.format("dfs.namenode.rpc-address.%s.%s", param.getHaName(), nns[1]), String.format("%s:8020", nnHosts[1]));
        conf.set("dfs.nameservices", param.getHaNames());
        try {
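            // BUG: every call re-logs-in, creating a new UGI; since FileSystem.CACHE
            // keys on the UGI, each get() below caches one more instance, never closed.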
            UserGroupInformation.setConfiguration(conf);
            UserGroupInformation.loginUserFromKeytab(param.getHdfsKerberosPrincipal(), param.getHdfsKerberosKeytab());
            fs = FileSystem.get(conf);
        } catch (Exception e) {
            LOGGER.error("Build FileSystem found exception, caused by:", e);
        }
        return fs;
    }
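To see the failure mode without a heap dump, here is a minimal sketch (the principal, keytab path, and the omitted Kerberos/HA conf settings are placeholders; it assumes a reachable KDC): each re-login yields a distinct UGI, so every FileSystem.get returns, and caches, a fresh instance.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.security.UserGroupInformation;

public class LeakDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("hadoop.security.authentication", "kerberos");
        // ...plus the defaultFS/HA settings from the method above...
        UserGroupInformation.setConfiguration(conf);
        for (int i = 0; i < 3; i++) {
            // Each login installs a brand-new UGI for the process...
            UserGroupInformation.loginUserFromKeytab("user@EXAMPLE.COM", "/etc/security/user.keytab");
            // ...so this never matches an existing CACHE entry: a new instance every time.
            FileSystem fs = FileSystem.get(conf);
            System.out.println(System.identityHashCode(fs)); // prints three different values
        }
    }
}

The fix below caches one Configuration per cluster and performs the keytab login only when the entry is first built (or when some other code path has replaced the process's login user), so FileSystem.get can keep hitting the same cache entry: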
    // One Configuration per cluster name, built (and keytab-logged-in) at most once.
    private Map<String, Configuration> confMap = new HashMap<>();

    private Configuration generateFileSystemConf(Cluster cluster) throws IOException {

        UserGroupInformation currentUser = UserGroupInformation.getCurrentUser();
        String userName = currentUser.getUserName();
        // Guard against some other operation having replaced the Kerberos login user
        // of the current process; if it has, drop the cached conf so we log in again.
        if (!HDFS_USER.equals(userName)) {
            LOGGER.info("The login user has changed, current user: {}, changing to {}", userName, HDFS_USER);
            confMap.remove(cluster.getClusterName());
        }
        if (confMap.get(cluster.getClusterName()) == null) {
            Configuration conf = new Configuration();
            HadoopClusterParam param = JSONObject.parseObject(cluster.getParam(), HadoopClusterParam.class);
            System.setProperty("java.security.krb5.conf", param.getKrb5Conf());
            conf.set("dfs.namenode.kerberos.principal", param.getHdfsKerberosPrincipal());
            conf.set("dfs.namenode.kerberos.principal.pattern", "*");
            conf.set("hadoop.security.authentication", "kerberos");
            conf.set("fs.trash.interval", "1");
            conf.set("fs.defaultFS", String.format("hdfs://%s", param.getHaName()));
            conf.set(String.format("dfs.client.failover.proxy.provider.%s", param.getHaName()), "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
            conf.set(String.format("dfs.ha.namenodes.%s", param.getHaName()), param.getHaNamenodes());
            String[] nns = param.getHaNamenodes().split(",");
            String[] nnHosts = param.getNamenodeAddress().split(",");
            conf.set(String.format("dfs.namenode.rpc-address.%s.%s", param.getHaName(), nns[0]), String.format("%s:8020", nnHosts[0]));
            conf.set(String.format("dfs.namenode.rpc-address.%s.%s", param.getHaName(), nns[1]), String.format("%s:8020", nnHosts[1]));
            conf.set("dfs.nameservices", param.getHaNames());
            UserGroupInformation.setConfiguration(conf);
            UserGroupInformation.loginUserFromKeytab(param.getHdfsKerberosPrincipal(), param.getHdfsKerberosKeytab());
            confMap.put(cluster.getClusterName(),conf);
            return conf;
        }
        return confMap.get(cluster.getClusterName());
    }

    public FileSystem getFileSystemInstance(Cluster cluster){
        FileSystem fs = null;
        try {
            Configuration conf = this.generateFileSystemConf(cluster);
            fs = FileSystem.get(conf);
        } catch (Exception e) {
            LOGGER.error("Build FileSystem found exception, caused by:", e);
        }
        return fs;
    }
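With the per-cluster Configuration cached and the login performed only once, FileSystem.get(conf) now hits Hadoop's internal cache on every subsequent call, so each cluster shares a single FileSystem instance. An illustrative check (service and cluster are placeholder names for an instance of the class above and a configured cluster):

FileSystem fs1 = service.getFileSystemInstance(cluster);
FileSystem fs2 = service.getFileSystemInstance(cluster);
// Same cached Configuration, same login user => same CACHE key => one shared instance.
LOGGER.info("same instance: {}", fs1 == fs2); // expected: true

One caveat: confMap is a plain HashMap, so if getFileSystemInstance can run on several threads at once, a ConcurrentHashMap (or synchronizing the method) would be safer.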

Postscript

For reference, here is FileSystem.get(URI, Configuration) from the Hadoop source. Unless caching is disabled for a scheme via fs.<scheme>.impl.disable.cache, instances are served from the static CACHE, whose key includes the current UserGroupInformation; that is exactly why the per-call re-login in the original code kept minting new entries.

  public static FileSystem get(URI uri, Configuration conf) throws IOException {
    String scheme = uri.getScheme();
    String authority = uri.getAuthority();

    if (scheme == null && authority == null) {     // use default FS
      return get(conf);
    }

    if (scheme != null && authority == null) {     // no authority
      URI defaultUri = getDefaultUri(conf);
      if (scheme.equals(defaultUri.getScheme())    // if scheme matches default
          && defaultUri.getAuthority() != null) {  // & default has authority
        return get(defaultUri, conf);              // return default
      }
    }
    
    String disableCacheName = String.format("fs.%s.impl.disable.cache", scheme);
    if (conf.getBoolean(disableCacheName, false)) {
      return createFileSystem(uri, conf);
    }

    return CACHE.get(uri, conf);
  }
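The disable-cache branch above also suggests an alternative fix: turn off caching for the hdfs scheme and manage FileSystem lifetimes explicitly. A minimal sketch (the defaultFS value is a placeholder, and the Kerberos settings from earlier are omitted for brevity); with the cache bypassed, every get() builds a fresh instance that the caller owns, so it must be closed:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class NoCacheExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://mycluster");
        // Bypass FileSystem.CACHE entirely for hdfs:// URIs.
        conf.setBoolean("fs.hdfs.impl.disable.cache", true);
        // FileSystem implements Closeable; with caching off, close it
        // deterministically instead of leaking it.
        try (FileSystem fs = FileSystem.get(conf)) {
            System.out.println(fs.exists(new Path("/")));
        }
    }
}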