java大数据安全

大数据安全--敏感数据识别和分级打标

2019-11-12  本文已影响0人  猪蹄胖

一、信息安全技术政务信息共享数据安全技术要求

数据安全技术要求
共享数据提供方在进行数据分级分类时的安全要求包括:


二、敏感数据识别和分级打标

数据分级分类的原则

数据分级分类方式

敏感数据识别

通过用户自定义规则,自动识别敏感数据
使用自带的规则或自定义规则,对其结构化表或者非结构化文件进行整体扫描、分级

image.png

三、敏感数据自动识别实现

3.1、敏感字段标注方案

敏感字段包括:
统一社会信用代码,车辆识别代码,营业执照号码,税务登记证号码,组织机构代码,图片,日期,IP地址,MAC地址,城市,性别,民族,省份,车牌号,电话号码,军官证,邮箱,护照号,港澳通行证,姓名,地址,手机号,身份证,银行卡

发现敏感字段方法

3.2、敏感字段识别

识别方式:正则匹配、关键字匹配、算法校验

数据识别问题

四、demo代码

4.1、识别mysql数据库中手机号码字段

对指定的mysql实例下的所有库、所有表、所有字段,遍历去匹配正则表达式,然后进行标记。

# -*- coding:utf-8 -*-

"""
@Author     :   Browser
@file       :   identity_mysql.py 
@time       :   2019/09/30
@software   :   PyCharm 
@description:   " "
"""

import pymysql
import re

# Risk-level labels attached to scanned values:
# s1 = no risk, s2 = low risk, s3 = medium risk, s4 = high risk.
s1 = "无风险"
s2 = "低风险"
s3 = "中风险"
s4 = "高风险"


def check_secret(value):
    """Classify *value* by whether it looks like a personal mobile number.

    The accepted prefixes are 13x, 145-149, 150-153/155-159, 165/166,
    170-178, 18x and 191/198/199, followed by eight more digits.

    Args:
        value: The text to test (a ``str``).

    Returns:
        ``s3`` (medium risk) when the whole string is a mobile number,
        otherwise ``s1`` (no risk).
    """
    # The original character classes were written as ``[0-3,5-9]``, ``[5,6]``
    # and ``[1,8,9]``; inside ``[...]`` a comma is a literal, so strings such
    # as "15,12345678" were accepted.  ``fullmatch`` also rejects a trailing
    # newline, which ``$`` would tolerate.
    phone_pattern = r'1(?:3[0-9]|4[5-9]|5[0-35-9]|6[56]|7[0-8]|8[0-9]|9[189])[0-9]{8}'
    if re.fullmatch(phone_pattern, value):
        return s3
    return s1
        
class DB(object):
    """Small helper around one PyMySQL connection for walking a MySQL instance.

    It lists schemas, tables and columns through ``information_schema`` and
    fetches one sample value per column.
    """

    # Schemas skipped by get_database(): MySQL's built-in schemas plus
    # 'loonflownew', which the original code also excluded.
    _EXCLUDED_SCHEMAS = ('information_schema', 'mysql', 'performance_schema',
                         'sys', 'loonflownew')

    def __init__(self, ip, username, password):
        """Connect to the MySQL server at *ip* with the given credentials."""
        self.ip = ip
        self.username = username
        self.password = password
        # Pre-set so that __del__ stays safe when connect() raises.
        self.db = None
        self.cursor = None
        # Keyword arguments: newer PyMySQL releases no longer accept
        # host/user/password positionally.
        self.db = pymysql.connect(host=self.ip, user=self.username,
                                  password=self.password)
        self.cursor = self.db.cursor()

    @staticmethod
    def _quote_identifier(name):
        """Return *name* as a backtick-quoted MySQL identifier."""
        return '`' + str(name).replace('`', '``') + '`'

    def get_database(self):
        """Return the names of all schemas except the excluded ones."""
        self.cursor.execute("SELECT schema_name from information_schema.schemata ")
        return [row[0] for row in self.cursor.fetchall()
                if row[0] not in self._EXCLUDED_SCHEMAS]

    def get_table(self, database):
        """Return the table names that belong to schema *database*."""
        # The value is bound by the driver instead of being formatted into
        # the SQL text, so quotes in the name cannot change the query.
        self.cursor.execute(
            "select table_name from information_schema.tables where table_schema = %s",
            (database,))
        return [row[0] for row in self.cursor.fetchall()]

    def get_column(self, database, table):
        """Return the column names of *table* in schema *database*."""
        self.cursor.execute(
            "select column_name from information_schema.columns "
            "where table_schema = %s and table_name = %s",
            (database, table))
        return [row[0] for row in self.cursor.fetchall()]

    def get_content(self, database, table, column):
        """Return the first value of *column*, or ``None`` for an empty table."""
        # Identifiers cannot be bound as parameters, so they are backtick
        # quoted; this also keeps reserved-word column names (e.g. `order`)
        # from breaking the statement.
        query = "select %s from %s.%s LIMIT 0,1" % (
            self._quote_identifier(column),
            self._quote_identifier(database),
            self._quote_identifier(table))
        self.cursor.execute(query)
        content = self.cursor.fetchall()
        if content:
            return content[0][0]
        return None

    def __del__(self):
        """Close the connection if one was opened."""
        db = getattr(self, 'db', None)
        if db is not None:
            db.close()

if __name__ == '__main__':
    import os

    # SECURITY: the original script embedded a real host name and root
    # password in the source.  Connection settings are now taken from the
    # environment; the previously published password should be rotated.
    db = DB(os.environ.get('MYSQL_HOST', 'localhost'),
            os.environ.get('MYSQL_USER', 'root'),
            os.environ.get('MYSQL_PASSWORD', ''))

    # Open the report once instead of re-opening it for every column.
    with open('message.txt', 'a+', encoding='UTF-8') as file:
        for database in db.get_database():
            for table in db.get_table(database):
                for column in db.get_column(database, table):
                    # One sample value per column is classified.
                    data_str = str(db.get_content(database, table, column))
                    result = [database, table, column, data_str, check_secret(data_str)]
                    file.write(str(result) + "\r\n")

4.2、敏感数据识别规则

IP地址: 正则表达式
# Exact match for an IPv4 address
def check_ip(value):
    """Print ``s2`` when *value* is a dotted-quad IPv4 address, else ``s1``.

    The pattern requires four dot-separated groups, each in the range 0-255
    (one or two leading zeros are tolerated by the last alternative).
    """
    matched = re.match(
        r'^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$',
        value,
    )
    level = s2 if matched else s1
    print('%s' % level)
MAC地址: 正则表达式
# Exact match for a MAC address
def check_mac(value):
    """Print ``s2`` when *value* is a MAC address, else ``s1``.

    Six two-digit hex groups are expected, separated consistently by either
    ``:`` or ``-``.
    """
    if not re.match(r'^(?:(?:(?:[a-f0-9A-F]{2}:){5})|(?:(?:[a-f0-9A-F]{2}-){5}))[a-f0-9A-F]{2}$', value):
        print('%s' % s1)
        return
    print('%s' % s2)
IPv6地址: 正则表达式
def check_ipv6(value):
    """Print ``s2`` when *value* matches the IPv6 pattern below, else ``s1``.

    The pattern tolerates surrounding whitespace, an embedded dotted-quad
    IPv4 tail and an optional ``%...`` suffix.
    """
    matcher = re.compile(
        r'^\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?\s*$'
    )
    level = s2 if matcher.match(value) else s1
    print('%s' % level)
手机号: 正则表达式
def check_phone(value):
    """Print ``s3`` when *value* is a mobile number, otherwise ``s1``.

    Accepted prefixes are 13x, 145-149, 150-153/155-159, 165/166, 170-178,
    18x and 191/198/199, followed by eight more digits.
    """
    # The original classes ``[0-3,5-9]``, ``[5,6]`` and ``[1,8,9]`` contained
    # literal commas, so "15,12345678" was accepted.  ``fullmatch`` also
    # rejects a trailing newline, which ``$`` would allow.
    phone_pattern = r'1(?:3[0-9]|4[5-9]|5[0-35-9]|6[56]|7[0-8]|8[0-9]|9[189])[0-9]{8}'
    if re.fullmatch(phone_pattern, value):
        print('%s' % s3)
    else:
        print('%s' % s1)
银行卡: 算法
def check_bank_card(card_num):
    """Return ``True`` when *card_num* passes the Luhn checksum.

    Args:
        card_num: The card number as a string of decimal digits.

    Returns:
        ``True`` if the Luhn sum is a multiple of 10; ``False`` for a failing
        checksum, an empty value, or a string containing non-digits.
    """
    # An empty string used to "pass" (sum 0 is divisible by 10) and
    # non-digit characters used to raise ValueError; both are now rejected.
    if not card_num or not card_num.isdecimal():
        return False
    total = 0
    # Walk from the rightmost digit; every second digit is doubled.
    for position, char in enumerate(reversed(card_num), start=1):
        digit = int(char)
        if position % 2 == 0:
            digit *= 2
            if digit > 9:
                # Same as adding the two digits of the doubled value.
                digit -= 9
        total += digit
    return total % 10 == 0
身份证: 算法
def check_IDNumber(value):
    """Validate the check character of an 18-character resident ID number.

    Prints a "passed" message tagged with ``s4`` when the last character
    equals the computed check character, otherwise a "failed" message that
    names the expected character, tagged with ``s1``.

    Args:
        value: The ID number as a string; a lowercase trailing ``x`` is
            accepted.

    Raises:
        TypeError: If *value* is not 18 characters long, if any of the first
            17 characters is not a digit, or if the last one is not a digit
            or ``X``.
    """
    # Check character indexed by (weighted sum mod 11).
    check_chars = '10X98765432'
    digits = '0123456789'
    if len(value) != 18:
        raise TypeError(u'请输入标准的第二代身份证号码')
    normalized = value.upper()
    body, tail = normalized[:17], normalized[17]
    # Previously a non-digit (or lowercase 'x') surfaced as an opaque
    # "NoneType * int" TypeError; keep the exception type, fix the message.
    if any(char not in digits for char in body) or tail not in digits + 'X':
        raise TypeError(u'请输入标准的第二代身份证号码')
    # Position weights are 2**(17 - index) mod 11: 7, 9, 10, 5, 8, 4, 2, ...
    check_num = sum(int(char) * (2 ** (17 - index) % 11)
                    for index, char in enumerate(body))
    verify_code = check_chars[check_num % 11]
    if tail == verify_code:
        print(u"身份证号: %s, 校验通过," % value + s4)
    else:
        print(u"身份证号: %s, 校验不通过, 正确尾号应该为:%s," % (value, verify_code) + s1)
地址:自然语言处理工具包(CRF)
import re,sys
from pyhanlp import *

# Risk-level labels: s1 = no risk, s2 = low, s3 = medium, s4 = high.
s1 = "无风险"
s2 = "低风险"
s3 = "中风险"
s4 = "高风险"


def check_chinese_address_recognition(value):
    """Segment *value* with HanLP's CRF segmenter and flag place-name words.

    Each distinct word is printed once: tagged as an address with ``s3``
    when its nature tag contains ``ns``/``nsf``, otherwise as a regular word
    with ``s1``.
    """
    segmenter = HanLP.newSegment("crf")
    # Keyed by word: a repeated word is reported once, with its last tag.
    natures = {}
    for term in segmenter.seg(value):
        natures[str(term.word)] = str(term.nature)
    # NOTE(review): 'ns' / 'nsf' are presumably HanLP's place-name tags.
    address_tags = r'(ns|nsf)'
    for word, nature in natures.items():
        if re.search(address_tags, nature):
            print('地址:%s' % word + ',风险等级:' + s3)
        else:
            print('常规词:%s' % word + ',风险等级:' + s1)


if __name__ == "__main__":
    # The argument is read here rather than at import time, so importing the
    # module without CLI arguments no longer raises IndexError.
    if len(sys.argv) < 2:
        sys.exit("usage: python <script> <text>")
    check_chinese_address_recognition(sys.argv[1])
image.png
姓名:自然语言处理工具包(CRF)
import sys,re
from pyhanlp import *

# Risk-level labels: s1 = no risk, s2 = low, s3 = medium, s4 = high.
s1 = "无风险"
s2 = "低风险"
s3 = "中风险"
s4 = "高风险"


def check_chinese_name_recognition(value):
    """Segment *value* with HanLP's CRF segmenter and flag person-name words.

    Each distinct word is printed once: tagged as a name with ``s4`` when
    its nature tag contains ``nr``, otherwise as a regular word with ``s1``.
    """
    segmenter = HanLP.newSegment("crf")
    # Keyed by word: a repeated word is reported once, with its last tag.
    natures = {}
    for term in segmenter.seg(value):
        natures[str(term.word)] = str(term.nature)
    # NOTE(review): 'nr' is presumably HanLP's person-name tag prefix.
    person_tag = r'nr'
    for word, nature in natures.items():
        if re.search(person_tag, nature):
            print('姓名:%s' % word + ',风险等级:' + s4)
        else:
            print('常规词:%s' % word + ',风险等级:' + s1)


if __name__ == "__main__":
    # The argument is read here rather than at import time, so importing the
    # module without CLI arguments no longer raises IndexError.
    if len(sys.argv) < 2:
        sys.exit("usage: python <script> <text>")
    check_chinese_name_recognition(sys.argv[1])
image.png
性别: 正则表达式
def check_gender(value):
    """Print ``s2`` when *value* is exactly a gender word, else ``s1``.

    Recognised words are the two Chinese characters in the pattern plus
    ``male`` and ``female``.
    """
    is_gender = re.match(r'^((男|male)|(女|female))$', value) is not None
    print('%s' % (s2 if is_gender else s1))
民族: 正则表达式
def check_national(value):
    """Print ``s3`` when *value* is an ethnic-group name, else ``s1``.

    The first alternative lists the bare names, the second the same names
    with the trailing "族" character.
    """
    ethnic_names = re.compile(
        r'^((汉|满|蒙古|回|藏|维吾尔|苗|彝|壮|布依|侗|瑶|白|土家|哈尼|哈萨克|傣|黎'
        r'|傈僳|佤|畲|高山|拉祜|水|东乡|纳西|景颇|柯尔克孜|土|达斡尔|仫佬|羌|布朗'
        r'|撒拉|毛南|仡佬|锡伯|阿昌|普米|朝鲜|塔吉克|怒|乌孜别克|俄罗斯|鄂温克|德昂'
        r'|保安|裕固|京|塔塔尔|独龙|鄂伦春|赫哲|门巴|珞巴|基诺)'
        r'|(汉族|满族|蒙古族|回族|藏族|维吾尔族|苗族|彝族|壮族|布依族|侗族|瑶族|白族|'
        r'土家族|哈尼族|哈萨克族|傣族|黎族|傈僳族|佤族|畲族|高山族|拉祜族|水族|东乡族|'
        r'纳西族|景颇族|柯尔克孜族|土族|达斡尔族|仫佬族|羌族|布朗族|撒拉族|毛南族|仡佬族|'
        r'锡伯族|阿昌族|普米族|朝鲜族|塔吉克族|怒族|乌孜别克族|俄罗斯族|鄂温克族|德昂族|'
        r'保安族|裕固族|京族|塔塔尔族|独龙族|鄂伦春族|赫哲族|门巴族|珞巴族|基诺族))$'
    )
    if ethnic_names.match(value) is None:
        print('%s' % s1)
        return
    print('%s' % s3)
省份: 正则表达式
def check_provinces(value):
    """Print ``s2`` when *value* is a full province-level name, else ``s1``.

    Only the complete names listed in the pattern are accepted (including
    the "市" / "省" / "自治区" / "特别行政区" suffix).
    """
    province_names = (
        r'^(北京市|天津市|上海市|重庆市|河北省|山西省|辽宁省|吉林省|黑龙江省|江苏省|'
        r'浙江省|安徽省|福建省|江西省|山东省|河南省|湖北省|湖南省|广东省|海南省|四川省|'
        r'贵州省|云南省|陕西省|甘肃省|青海省|台湾省|内蒙古自治区|广西壮族自治区|西藏自治区|'
        r'宁夏回族自治区|新疆维吾尔自治区|香港特别行政区|澳门特别行政区)$'
    )
    level = s2 if re.match(province_names, value) else s1
    print('%s' % level)
车牌号: 正则表达式
def check_carnum(value):
    """Print ``s3`` when *value* is a vehicle licence plate, else ``s1``.

    Three layouts are accepted after the region character and the letter
    that follows it: five digits plus a trailing ``D``/``F``; a leading
    ``D``/``F`` plus one alphanumeric and four digits; or the conventional
    five-character sequence.
    """
    region = r'[京津沪渝冀豫云辽黑湘皖鲁新苏浙赣鄂桂甘晋蒙陕吉闽贵粤青藏川宁琼使领A-Z]'
    carnum_pattern = (
        region + r'[A-Z]'
        r'(?:'
        # New-energy layout with the D/F marker at the end.
        r'[0-9]{5}[DF]'
        # New-energy layout with the D/F marker first.  The original had the
        # literal text ``DF`` here, which looks like a garbled ``[DF]`` class.
        # NOTE(review): confirm the layout against the plate standard.
        r'|[DF][A-HJ-NP-Z0-9][0-9]{4}'
        # Conventional plate, optionally ending in a special-purpose suffix.
        r'|[A-HJ-NP-Z0-9]{4}[A-HJ-NP-Z0-9挂学警港澳]'
        r')'
    )
    # fullmatch: the original pattern had no end anchor, so any string that
    # merely started with a plate (e.g. followed by garbage) was accepted.
    if re.fullmatch(carnum_pattern, value):
        print('%s' % s3)
    else:
        print('%s' % s1)
电话号码: 正则表达式
def check_telephone(value):
    """Print ``s3`` when *value* is a landline number, otherwise ``s1``.

    Accepted forms are 7-8 digits, optionally preceded by an area code of
    ``0`` plus 2-3 digits and a hyphen (e.g. ``010-12345678``).
    """
    # In the original ``^(area-)(number)|(number)$`` each anchor applied to
    # only one alternative, so "010-12345678abc" was accepted.  The area code
    # is now an optional prefix and the whole string must match.
    telephone_pattern = r'(?:0\d{2,3}-)?\d{7,8}'
    if re.fullmatch(telephone_pattern, value):
        print('%s' % s3)
    else:
        print('%s' % s1)
军官证:正则表达式
def check_officer(value):
    """Print ``s3`` when *value* looks like a military officer ID, else ``s1``.

    Expected shape: one CJK character, the text "字第", 4-8 alphanumerics
    and an optional trailing "号".
    """
    officer_re = re.compile(r'^[\u4E00-\u9FA5](字第)([0-9a-zA-Z]{4,8})(号?)$')
    if officer_re.match(value) is None:
        print('%s' % s1)
        return
    print('%s' % s3)
邮箱: 正则表达式
def check_email(value):
    """Print ``s2`` when *value* is an e-mail address, otherwise ``s1``.

    The address must consist of a local part, ``@``, and a domain with at
    least one dot-separated label after the first.
    """
    # Fixes over the original ``[\w-]+@[\w-]+(.[\w-]+)+``:
    #  * the dot was unescaped, so "a@bcd" counted as an address;
    #  * there was no end anchor, so trailing garbage was ignored;
    #  * dots and plus signs in the local part ("first.last@...") were
    #    rejected.
    email_pattern = r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+'
    if re.fullmatch(email_pattern, value):
        print('%s' % s2)
    else:
        print('%s' % s1)
护照号: 正则表达式
def check_passport(value):
    """Print ``s3`` when *value* is 5-17 ASCII letters/digits, else ``s1``."""
    # The original class ``[a-zA-z]`` spans the ASCII range 'A'..'z', which
    # also contains ``[ \ ] ^ _`` and the backtick; ``A-Z`` is what was meant.
    passport_pattern = r'[a-zA-Z0-9]{5,17}'
    if re.fullmatch(passport_pattern, value):
        print('%s' % s3)
    else:
        print('%s' % s1)
港澳通行证: 正则表达式
def check_HM_pass(value):
    """Print ``s3`` for a Hong Kong/Macau travel-permit number, else ``s1``.

    Expected shape: one of ``H``/``M`` (either case) followed by exactly
    ten or exactly eight digits.
    """
    permit_match = re.match(r'^[HMhm]{1}([0-9]{10}|[0-9]{8})$', value)
    level = s3 if permit_match else s1
    print('%s' % level)
JDBC连接串: 正则表达式
def check_jdbc(value):
    """Print ``s4`` when *value* is a JDBC connection string, else ``s1``.

    Three URL shapes are recognised, all with a dotted-quad IPv4 host and a
    numeric port: SQL Server (optionally ``microsoft:``-prefixed, with
    optional ``;key=value`` properties), Oracle thin (``@host:port:SID``)
    and MySQL (``//host:port/database`` with an optional query string).
    Host names such as ``localhost`` are not recognised.
    """
    octet = r'(?:25[0-5]|2[0-4]\d|[0-1]\d{2}|[1-9]?\d)'
    host = octet + r'(?:\.' + octet + r'){3}'
    # Ports 1-65535.  The original five-digit branch
    # ``[1-6][0-5][0-5][0-3][0-5]`` limited every digit independently and so
    # rejected valid ports such as 27017 or 49152.
    port = (r'(?:6553[0-5]|655[0-2]\d|65[0-4]\d{2}|6[0-4]\d{3}'
            r'|[1-5]\d{4}|[1-9]\d{0,3})')
    name = r'[A-Za-z0-9_]+'
    sqlserver = (r'(?:microsoft:)?sqlserver://' + host + ':' + port +
                 r"(?:;[ \d\w\/=\?%\-&_~`@[\]\':+!]*)?")
    oracle = r'oracle:thin:@' + host + ':' + port + ':' + name
    mysql = (r'mysql://' + host + ':' + port + '/' + name +
             r"(?:\?[\d\w\/=\?%\-&_~`@[\]\':+!]*)?")
    jdbc_pattern = 'jdbc:(?:' + '|'.join((sqlserver, oracle, mysql)) + ')'
    if re.fullmatch(jdbc_pattern, value):
        print('%s' % s4)
    else:
        print('%s' % s1)
日期:正则表达式
def check_datetime(value):
    """Print ``s2`` when *value* is a date-time string, otherwise ``s1``.

    Expected shape: a 19xx/20xx calendar date written with ``-`` or ``/``
    separators (month lengths and 29 February in leap years are checked by
    the pattern), one whitespace character, then ``HH:MM:SS``.
    """
    timestamp_re = re.compile(
        r'((((19|20)\d{2})[-/](0?(1|[3-9])|1[012])[-/](0?[1-9]|[12]\d|30))|(((19|20)\d{2})[-/](0?[13578]|1[02])[-/]31)|'
        r'(((19|20)\d{2})[-/]0?2[-/](0?[1-9]|1\d|2[0-8]))|((((19|20)([13579][26]|[2468][048]|0[48]))|(2000))[-/]0?2[-/]29))'
        r'\s([0-1][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])$'
    )
    if timestamp_re.match(value) is None:
        print('%s' % s1)
        return
    print('%s' % s2)
车辆识别代码:正则表达式
def check_vin(value):
    """Print ``s3`` when *value* is a 17-character VIN, otherwise ``s1``.

    Layout enforced by the pattern: eight characters from digits and the
    capital letters without I, O and Q; one digit or ``X``; two more such
    characters; six digits.
    """
    # The original first class was ``[A-HJ-NPR-Z\\d]``: in a raw string that
    # is a literal backslash plus the letter "d", so digits were rejected in
    # the first eight positions and almost no real VIN matched.
    vin_pattern = r'[A-HJ-NPR-Z\d]{8}[\dX][A-HJ-NPR-Z\d]{2}\d{6}'
    if re.fullmatch(vin_pattern, value):
        print('%s' % s3)
    else:
        print('%s' % s1)
组织机构代码:算法
def check_organization(value):
    """Print ``s3`` when *value* is a valid organization code, else ``s1``.

    The value is upper-cased and stripped of hyphens; it must then be eight
    alphanumerics plus a check character (digit or ``X``) that agrees with
    the weighted mod-11 checksum of the first eight.
    """
    code = value.upper().replace('-', '')
    if not re.search(r'^[\dA-Z]{8}[X\d]$', code, re.S):
        print('%s' % s1)
        return

    weights = (3, 7, 9, 10, 5, 8, 4, 2)
    weighted_sum = 0
    for char, weight in zip(code[:8], weights):
        # Letters count as ord(char) - 55, i.e. A=10 ... Z=35.
        number = int(ord(char) - 55) if char.isalpha() else int(char)
        weighted_sum += number * weight

    remainder = 11 - weighted_sum % 11
    if remainder == 10:
        expected = 'X'
    elif remainder == 11:
        expected = '0'
    else:
        expected = str(remainder)

    level = s3 if expected == code[-1] else s1
    print('%s' % level)
营业执照号码:算法
def check_business(value):
    """Print ``s3`` when *value* is a valid 15-digit licence number, else ``s1``.

    The first 14 digits are folded through an iterative mod-11 / mod-10
    recurrence; the result determines the expected 15th digit.
    """
    if re.search(r'^\d{15}$', value, re.S) is None:
        print('%s' % s1)
        return

    carry = 10
    for digit_char in value[:14]:
        partial = (carry % 11 + int(digit_char)) % 10
        if partial == 0:
            # A zero remainder is replaced by 10 before doubling.
            partial = 10
        carry = (partial * 2) % 11

    expected = (11 - (carry % 10)) % 10
    level = s3 if str(expected) == value[-1] else s1
    print('%s' % level)
统一社会信用代码:算法
def check_credit(value):
    """Print ``s3`` when *value* is a valid unified social credit code.

    The upper-cased value must match the structural pattern and its 18th
    character must equal the check character computed from the first 17
    (weighted sum, modulo 31).  ``s1`` is printed otherwise.

    Returns:
        ``False`` (without printing) when the value is not 18 characters
        long, as in the original; ``None`` in every other case.
    """
    credit_str = value.upper()
    credit_pattern = r'^(1[129]|5[1239]|9[123]|Y1)\d{6}[\dA-Z]{8}[X\d][\dA-Z]$'
    if len(credit_str) != 18:
        return False
    if not re.search(credit_pattern, credit_str, re.S):
        print('%s' % s1)
        return

    # Code alphabet: a character's value is its index (0-9, then A=10 ...
    # Y=30; the letters I, O, S, V and Z are not part of the alphabet).
    alphabet = '0123456789ABCDEFGHJKLMNPQRTUWXY'
    weights = (1, 3, 9, 27, 19, 26, 16, 17, 20, 29, 25, 13, 8, 24, 10, 30, 28)
    total = 0
    for char, weight in zip(credit_str, weights):  # first 17 characters
        char_value = alphabet.find(char)
        if char_value < 0:
            # A letter outside the alphabet cannot occur in a valid code.
            print('%s' % s1)
            return
        total += char_value * weight

    # Two defects fixed here: a numeric check value used to stay an int and
    # was compared with a str (never equal), and a result of 31 mapped to ''
    # instead of '0'.
    expected = alphabet[(31 - total % 31) % 31]
    if expected == credit_str[-1]:
        print('%s' % s3)
    else:
        print('%s' % s1)
上一篇下一篇

猜你喜欢

热点阅读