DataFrame练习

2018-12-11 本文已影响0人 dechuan

最近用python写了个小程序，顺便回顾一下语法，做个记录。

import os
print(os.getcwd()) # 打印当前工作目录
os.chdir('D:\warning data') # 将当前工作目录改变为`D:\warning data`
print(os.listdir())
df = pd.read_excel('alarm20180915-1130.xls')

2. 如何读取excel中的特定列特定值

SystemDf=df['FIRSTOCCURRENCE']

得到结果

0       2018-09-15 00:00:49
1       2018-09-15 00:39:05
2       2018-09-15 00:57:22
3       2018-09-15 01:00:55
4       2018-09-15 01:05:41
5       2018-09-15 01:05:41

#带限制条件选择列
SystemDf=df.loc[(df['STRINGA']=='P1')|(df['STRINGA']=='P2')|(df['STRINGA']=='P3'),['SUMMARY','FIRSTOCCURRENCE']] 
#选择一列中的每一行
for i in SystemDf.index:
    print(SystemDf['FIRSTOCCURRENCE'][i])

#选择某一列中特定值所对应的信息
PartUsage=df.loc[frame2['PARTID'] == '159']

得到结果

                         USAGE
PARTID FIRSTOCCURRENCE        
159    2018-09-15       0.7825
       2018-09-16       0.7880
       2018-09-17       0.7940
       2018-11-26       0.7000
       2018-11-27       0.7045
       2018-11-28       0.7115
       2018-11-29       0.7185
       2018-11-30       0.7220

3. 如何从原excel中抽取有用信息并存成新的excel

#因为后期会有大量调用操作，建议将从原excel中抽取的信息存成字典格式，再使用to_csv将其保存为csv，得到列名为字典key值，每一列为value的符合excel读取方式又便于后期程序调用的文件
import re
diction={}
diction['FIRSTOCCURRENCE']=[]
diction['TIME']=[]
diction['LPAR']=[]
diction['ID']=[]
diction['MessageID']=[]
diction['Message']=[]
outfile=open('outfile.csv','w',encoding = 'utf-8')
outlier=open('outlier.csv','w',encoding = 'utf-8')
filter='(.*?\d)\s+(CP[1-3][A-F])\s+([SJ]\S+)\s+(\S+[^\:])\s+(.*)' #最终版，把大部分的内容筛出来
#outfile.write('TIME;LPAR;ID;MessageID;Message;')
#outfile.write(u'\n')
for i in SystemDf.index:
    pattern=re.compile(filter)
    result=pattern.search(SystemDf['SUMMARY'][i])
    if result:
        FIRSTOCCURRENCE=SystemDf2['FIRSTOCCURRENCE'][i]
        Time= result.group(1)
        LPAR= result.group(2)
        ID=result.group(3)
        MessageID=result.group(4)
        Message=result.group(5)
        diction['FIRSTOCCURRENCE'].append(FIRSTOCCURRENCE)
        diction['TIME'].append(Time)
        diction['LPAR'].append(LPAR)
        diction['ID'].append(ID)
        diction['MessageID'].append(MessageID)
        diction['Message'].append(Message)
    else:
        outlier.write(SystemDf['SUMMARY'][i])
        outlier.write(u'\n')
#print('TIME;LPAR;ID;MessageID;Message',file=outfile)
frame=DataFrame(diction)
frame.to_csv(outfile,index=False,columns=['FIRSTOCCURRENCE','TIME','LPAR','ID','MessageID','Message'])

outfile.close()
outlier.close() 
print('game over')
summ=frame.groupby(['MessageID']).size()
print(summ.sort_values(ascending=False)) #降序排列，选出出现最多的告警

#或者直接如下更好，用字典的格式建立一个DataFrame,要获取的是每个key对应的列表。
import re
row1=[]
row2=[]
row3=[]
row4=[]
row5=[]
row6=[]
outfile=open('outfile.csv','w',encoding = 'utf-8')
outlier=open('outlier.csv','w',encoding = 'utf-8')
filter1='(.*?\d)\s+(CP[1-3][A-F])\s+([SJ]\S+)\s+(\S+[^\:])\s+(.*)' #最终版，把大部分的内容筛出来
for i in SystemDf.index:
    pattern=re.compile(filter1)
    result=pattern.search(SystemDf['SUMMARY'][i])
    if result:
        FIRSTOCCURRENCE=SystemDf2['FIRSTOCCURRENCE'][i]
        Time= result.group(1)
        LPAR= result.group(2)
        ID=result.group(3)
        MessageID=result.group(4)
        Message=result.group(5)
        row1.append(FIRSTOCCURRENCE)
        row2.append(Time)
        row3.append(LPAR)
        row4.append(ID)
        row5.append(MessageID)
        row6.append(Message)
    else:
        outlier.write(SystemDf['SUMMARY'][i])
        outlier.write(u'\n')
frame1=DataFrame({'FIRSTOCCURRENCE':row1,'TIME':row2,'LPAR':row3,'ID':row4,'MessageID':row5,'Message':row6})
frame1.to_csv(outfile,index=False,columns=['FIRSTOCCURRENCE','TIME','LPAR','ID','MessageID','Message'])

outfile.close()
outlier.close() 
print('game over')
summ=frame1.groupby(['MessageID']).size()
print(summ.sort_values(ascending=False)) #降序排列，选出出现最多的告警

结果如下

 game over
MessageID
+DBA:BANCS     768
BNROUTACTS0    645
*IEF099I       548
+DBA:EISS      215
DNFO1777E      144
BNEEPUACTS0    144
+CSQX209E      108
DNFO1709E      103
DNFF4141I      100
...

4. 使用透视表获得聚合后的数据结果

row1=[]
row2=[]
row3=[]
filter2='PART\s(\d+)\s(.*?)\s'
pattern=re.compile(filter2)
filter3='(\d+)\-(\d+)\-(\d+)\s'
pattern2=re.compile(filter3)
for i in DBAWarning.index:
    DBAMess=pattern.search(DBAWarning['Message'][i])
    DBATime=pattern2.search(str(DBAWarning['FIRSTOCCURRENCE'][i]))
    if DBAMess:
        row1.append(DBAMess.group(1))
        #将字符串格式的使用率（如80%）转化成数字格式，如果是字符串的话无法使用pivot_table
        usage=float(DBAMess.group(2).strip('%'))/100
        p_float=round(usage,3)
        #p_float=float('%.3f' % usage)
        #保留小数点后面3位
        row2.append(p_float)
        row3.append(DBATime.group(0))
frame2=DataFrame({'PARTID':row1,'USAGE':row2,'FIRSTOCCURRENCE':row3})
FrameResult=open('frame2.csv','w',encoding = 'utf-8')
frame2.to_csv(FrameResult,index=False)
FrameResult.close() 
StatisticDBA=pd.pivot_table(frame2,index=['PARTID','FIRSTOCCURRENCE'],values=['USAGE'])
Input=frame2.loc[frame2['PARTID'] == '159']
PartUsage=pd.pivot_table(Input,index=['PARTID','FIRSTOCCURRENCE'],values=['USAGE'])
print(PartUsage)

得到结果

                        USAGE
PARTID FIRSTOCCURRENCE        
159    2018-09-15       0.7825
       2018-09-16       0.7880
       2018-09-17       0.7940
       2018-11-26       0.7000
       2018-11-27       0.7045
       2018-11-28       0.7115
       2018-11-29       0.7185
       2018-11-30       0.7220

DataFrame练习

1. 如何读取更换目录并读取目录下的文件

2. 如何读取excel中的特定列特定值

3. 如何从原excel中抽取有用信息并存成新的excel

4. 使用透视表获得聚合后的数据结果

猜你喜欢

热点阅读