Python处理千万行数据的过程和思路.md

2020-08-08  本文已影响0人  平凡的运维之路

事情的来源

现有阶段实现方式
#!/bin/python
import redis

def FileToList(Filename):
    """Read a text file and return its lines with surrounding whitespace removed.

    The file object is iterated directly instead of calling ``readlines()``,
    so only the result list is held in memory rather than a second full copy
    of the file's lines.

    Args:
        Filename: Path of the text file to read.

    Returns:
        A list with one stripped string per line, in file order. Blank lines
        are kept as empty strings.
    """
    # Progress marker showing which file is being loaded.
    print(Filename,'---->')
    with open(Filename,'r')  as  File:
        return [line.strip() for line in File]

def FileToSplistList(Filename):
    """Read a file of "/"-separated paths and return each line's last component.

    The file object is iterated directly instead of calling ``readlines()``,
    so the raw lines are not all materialised before processing.

    Args:
        Filename: Path of the text file to read.

    Returns:
        A list with, for every line in file order, the text after the final
        "/" of the stripped line (the whole stripped line when it contains no
        "/").
    """
    # Progress marker showing which file is being loaded.
    print(Filename,'---->')
    with open(Filename,'r')  as  File:
        return [line.strip().split("/")[-1] for line in File]

def HandleIndex(SqlList,RecordList):
    """Split index rows into a "matched" file and a "difference" file.

    For each row, the first comma-separated field is split on "/": the last
    component is the record name and the component at index 1 is ``Entid``.
    A row is written to ``Writefile`` when its record name is present in
    ``RecordList`` and is not yet a member of the Redis set
    ``"RecordName" + Entid``; the name is then added to that set and to the
    Redis set ``'RecordList'``. Every other row is written to
    ``ChayiWritefile``.

    Uses the module-level globals ``client`` (Redis client), ``Writefile``
    and ``ChayiWritefile`` (open, writable file objects).

    Args:
        SqlList: Iterable of index rows (strings).
        RecordList: Iterable of record names that exist; it is converted to a
            set once so each lookup is O(1) instead of a full list scan.

    Raises:
        IndexError: If a row's first field has no "/" separator.
    """
    # A list membership test inside the loop scans the whole list per row.
    KnownRecords = set(RecordList)
    for Sqltxt in SqlList:
        PathParts = Sqltxt.split(",")[0].split("/")
        RecordName = PathParts[-1]
        Entid = PathParts[1]
        KeyName =  "RecordName" + Entid
        # Check the local set first: it is cheap, and rows with no matching
        # recording skip the Redis round trip entirely.
        if RecordName in KnownRecords and not client.sismember(KeyName,RecordName):
            client.sadd(KeyName,RecordName)
            client.sadd('RecordList', RecordName)
            Writefile.write(Sqltxt + "\n")
        else:
            ChayiWritefile.write(Sqltxt + "\n")

if __name__ == '__main__':
    # Script entry point: load the index rows and the recording names, then
    # let HandleIndex split the rows into the two output files.
    # HandleIndex reads `client`, `Writefile` and `ChayiWritefile` as
    # module-level globals, so those names must be bound here.
    client = redis.StrictRedis(host='localhost', port=6379, db=0)

    recordfiles = './Xsfiles/0211270039.txt'
    RecordIndexs = './DsRecordIndex/0211270039.txt'
    Entid = "0211270039"
    filename = Entid + "_sql.txt"
    Chayifilename = Entid + "_ChaYi_sql.txt"

    # `with` guarantees both output files are flushed and closed even when
    # loading or processing raises part-way through.
    with open(filename, 'a') as Writefile, open(Chayifilename, 'a') as ChayiWritefile:
        newSqlList =  FileToList(RecordIndexs)
        newRecordList =  FileToSplistList(recordfiles)
        HandleIndex(newSqlList,newRecordList)

改造实现方式
#使用新的方式:循环第一个文件列表,到redis中判断记录是否存在,每秒能处理4K多条数据
[root@xxxx NewRun]# wc -l 0211270052_sql.txt ; sleep 1 ; wc -l 0211270052_sql.txt 
293040 0211270052_sql.txt
297450 0211270052_sql.txt

#而使用Python对两个文件list做循环比对,每秒只能处理20多条数据,而这两个列表的数据量只有350W。
[root@xxxx XianShanData]# wc -l 0211270039_sql.txt   ; sleep 1;  wc -l 0211270039_sql.txt
3166737 0211270039_sql.txt
3166759 0211270039_sql.txt

[root@xxxx NewRun]# redis-cli 
127.0.0.1:6379> scard  0211270052_RecordFile_List
(integer) 15619783

127.0.0.1:6379> KEYS *
1) "0211270052_RecordIndex_List"
2) "0211270052_RecordFile_List"
3) "test001"

127.0.0.1:6379> del test001
(integer) 1

127.0.0.1:6379> sscan 0211270053_RecordFile_List 1
1) "2305"
2)  1) "TEL-18683047475_8000662250_20190108101557.wav"

#!/bin/python
#-*- coding:utf-8 -*-

import redis

"""
1.新增的做法是:先把上面的录音文件名读取出来,放入redis库的set集合中,然后循环索引记录,判断其是否在redis的set集合里面,再根据返回值判断是否能对应上
2.2020年8月8日16:42:18

"""

def FileToList(Filename):
    """Load every line of a text file, stripped of leading/trailing whitespace.

    Lines are streamed from the file object; ``readlines()`` is avoided
    because it would build a complete intermediate list of raw lines before
    the stripped list is produced.

    Args:
        Filename: Path of the text file to read.

    Returns:
        A list of stripped lines in file order (blank lines become ``""``).
    """
    # Progress marker showing which file is being loaded.
    print(Filename,'---->')
    with open(Filename,'r')  as  File:
        return [line.strip() for line in File]

def FileToSplistList(Filename):
    """Return the final "/"-separated component of every line in a file.

    Lines are streamed from the file object; ``readlines()`` is avoided
    because it would build a complete intermediate list of raw lines first.

    Args:
        Filename: Path of the text file to read.

    Returns:
        A list, in file order, of the text after the last "/" of each
        stripped line (the whole stripped line when it contains no "/").
    """
    # Progress marker showing which file is being loaded.
    print(Filename,'---->')
    with open(Filename,'r')  as  File:
        return [line.strip().split("/")[-1] for line in File]


def FileToSplistToRedisSet(Filename,KeyName,BatchSize=1000):
    """Add the last "/"-separated component of every line to a Redis set.

    Names are sent to Redis in batches through a single variadic ``sadd``
    call per batch instead of one call per line, and the file is streamed
    rather than loaded with ``readlines()``. The resulting set content is the
    same as adding the names one by one.

    Uses the module-level global ``client`` (Redis client).

    Args:
        Filename: Path of the text file to read.
        KeyName: Name of the Redis set that receives the names.
        BatchSize: Maximum number of names sent per ``sadd`` call.
    """
    # Progress marker showing which file is being loaded.
    print(Filename,'---->')
    Batch = []
    with open(Filename,'r')  as  File:
        for line in File:
            Batch.append(line.strip().split("/")[-1])
            if len(Batch) >= BatchSize:
                client.sadd(KeyName,*Batch)
                Batch = []
    # Flush the remainder; skipped when empty because SADD needs a member.
    if Batch:
        client.sadd(KeyName,*Batch)

def HandleIndex(SqlList,RecordFileKeyName,RecordListKeyName):
    """Split index rows into a "matched" file and a "difference" file via Redis.

    For each row, the record name is the last "/"-separated component of the
    first comma-separated field. A row is written to ``Writefile`` when that
    name is a member of the Redis set ``RecordFileKeyName`` and is not yet a
    member of the Redis set ``RecordListKeyName``; the name is then added to
    ``RecordListKeyName``, so a later row with the same name goes to the
    difference file. Every other row is written to ``ChayiWritefile``.

    Uses the module-level globals ``client`` (Redis client), ``Writefile``
    and ``ChayiWritefile`` (open, writable file objects).

    Args:
        SqlList: Iterable of index rows (strings).
        RecordFileKeyName: Redis set holding the names of existing recordings.
        RecordListKeyName: Redis set collecting names already written out.
    """
    for Sqltxt in SqlList:
        RecordName = Sqltxt.split(",")[0].split("/")[-1]
        # Short-circuit: when the recording does not exist, the second
        # membership query cannot change the outcome and is skipped.
        InRecordFiles = client.sismember(RecordFileKeyName,RecordName)
        if InRecordFiles and not client.sismember(RecordListKeyName,RecordName):
            client.sadd(RecordListKeyName,RecordName)
            Writefile.write(Sqltxt + "\n")
        else:
            ChayiWritefile.write(Sqltxt + "\n")

if __name__ == '__main__':
    # Script entry point: load the recording names into a Redis set, then
    # classify every index row against it.
    # HandleIndex and FileToSplistToRedisSet read `client`, `Writefile` and
    # `ChayiWritefile` as module-level globals, so they must be bound here.
    client = redis.StrictRedis(host='localhost', port=6379, db=0)

    recordfiles = '/home/record/XianShanData/Xsfiles/0211270052.txt'
    RecordIndexs = '/home/record/XianShanData/DsRecordIndex/0211270052.txt'
    Entid = "0211270052"
    filename = Entid + "_sql.txt"
    Chayifilename = Entid + "_ChaYi_sql.txt"
    RecordFileKeyName =  Entid + "_RecordFile_List"
    RecordListKeyName =  Entid + "_RecordIndex_List"

    # `with` guarantees both output files are flushed and closed even when
    # loading or processing raises part-way through.
    with open(filename, 'a') as Writefile, open(Chayifilename, 'a') as ChayiWritefile:
        newSqlList =  FileToList(RecordIndexs)
        # Start from empty sets so results of an earlier run cannot leak in.
        client.delete(RecordFileKeyName)
        client.delete(RecordListKeyName)
        FileToSplistToRedisSet(recordfiles,RecordFileKeyName)
        # The recording names now live only in Redis; the in-memory copy the
        # script used to build here was never read, so it is no longer loaded.
        HandleIndex(newSqlList,RecordFileKeyName,RecordListKeyName)

总结分析

上一篇 下一篇

猜你喜欢

热点阅读