DataFrame练习
2018-12-11 本文已影响0人
dechuan
最近用python写了个小程序,顺便回顾一下语法,做个记录。
1. 如何读取更换目录并读取目录下的文件
import os
print(os.getcwd()) # 打印当前工作目录
os.chdir('D:\warning data') # 将当前工作目录改变为`D:\warning data`
print(os.listdir())
df = pd.read_excel('alarm20180915-1130.xls')
2. 如何读取excel中的特定列特定值
SystemDf=df['FIRSTOCCURRENCE']
- 得到结果
0 2018-09-15 00:00:49
1 2018-09-15 00:39:05
2 2018-09-15 00:57:22
3 2018-09-15 01:00:55
4 2018-09-15 01:05:41
5 2018-09-15 01:05:41
#带限制条件选择列
SystemDf=df.loc[(df['STRINGA']=='P1')|(df['STRINGA']=='P2')|(df['STRINGA']=='P3'),['SUMMARY','FIRSTOCCURRENCE']]
#选择一列中的每一行
for i in SystemDf.index:
print(SystemDf['FIRSTOCCURRENCE'][i])
#选择某一列中特定值所对应的信息
PartUsage=df.loc[frame2['PARTID'] == '159']
- 得到结果
USAGE
PARTID FIRSTOCCURRENCE
159 2018-09-15 0.7825
2018-09-16 0.7880
2018-09-17 0.7940
2018-11-26 0.7000
2018-11-27 0.7045
2018-11-28 0.7115
2018-11-29 0.7185
2018-11-30 0.7220
3. 如何从原excel中抽取有用信息并存成新的excel
#因为后期会有大量调用操作,建议将从原excel中抽取的信息存成字典格式,再使用to_csv将其保存为csv,得到列名为字典key值,每一列为value的符合excel读取方式又便于后期程序调用的文件
import re
diction={}
diction['FIRSTOCCURRENCE']=[]
diction['TIME']=[]
diction['LPAR']=[]
diction['ID']=[]
diction['MessageID']=[]
diction['Message']=[]
outfile=open('outfile.csv','w',encoding = 'utf-8')
outlier=open('outlier.csv','w',encoding = 'utf-8')
filter='(.*?\d)\s+(CP[1-3][A-F])\s+([SJ]\S+)\s+(\S+[^\:])\s+(.*)' #最终版,把大部分的内容筛出来
#outfile.write('TIME;LPAR;ID;MessageID;Message;')
#outfile.write(u'\n')
for i in SystemDf.index:
pattern=re.compile(filter)
result=pattern.search(SystemDf['SUMMARY'][i])
if result:
FIRSTOCCURRENCE=SystemDf2['FIRSTOCCURRENCE'][i]
Time= result.group(1)
LPAR= result.group(2)
ID=result.group(3)
MessageID=result.group(4)
Message=result.group(5)
diction['FIRSTOCCURRENCE'].append(FIRSTOCCURRENCE)
diction['TIME'].append(Time)
diction['LPAR'].append(LPAR)
diction['ID'].append(ID)
diction['MessageID'].append(MessageID)
diction['Message'].append(Message)
else:
outlier.write(SystemDf['SUMMARY'][i])
outlier.write(u'\n')
#print('TIME;LPAR;ID;MessageID;Message',file=outfile)
frame=DataFrame(diction)
frame.to_csv(outfile,index=False,columns=['FIRSTOCCURRENCE','TIME','LPAR','ID','MessageID','Message'])
outfile.close()
outlier.close()
print('game over')
summ=frame.groupby(['MessageID']).size()
print(summ.sort_values(ascending=False)) #降序排列,选出出现最多的告警
#或者直接如下更好,用字典的格式建立一个DataFrame,要获取的是每个key对应的列表。
import re
row1=[]
row2=[]
row3=[]
row4=[]
row5=[]
row6=[]
outfile=open('outfile.csv','w',encoding = 'utf-8')
outlier=open('outlier.csv','w',encoding = 'utf-8')
filter1='(.*?\d)\s+(CP[1-3][A-F])\s+([SJ]\S+)\s+(\S+[^\:])\s+(.*)' #最终版,把大部分的内容筛出来
for i in SystemDf.index:
pattern=re.compile(filter1)
result=pattern.search(SystemDf['SUMMARY'][i])
if result:
FIRSTOCCURRENCE=SystemDf2['FIRSTOCCURRENCE'][i]
Time= result.group(1)
LPAR= result.group(2)
ID=result.group(3)
MessageID=result.group(4)
Message=result.group(5)
row1.append(FIRSTOCCURRENCE)
row2.append(Time)
row3.append(LPAR)
row4.append(ID)
row5.append(MessageID)
row6.append(Message)
else:
outlier.write(SystemDf['SUMMARY'][i])
outlier.write(u'\n')
frame1=DataFrame({'FIRSTOCCURRENCE':row1,'TIME':row2,'LPAR':row3,'ID':row4,'MessageID':row5,'Message':row6})
frame1.to_csv(outfile,index=False,columns=['FIRSTOCCURRENCE','TIME','LPAR','ID','MessageID','Message'])
outfile.close()
outlier.close()
print('game over')
summ=frame1.groupby(['MessageID']).size()
print(summ.sort_values(ascending=False)) #降序排列,选出出现最多的告警
- 结果如下
game over
MessageID
+DBA:BANCS 768
BNROUTACTS0 645
*IEF099I 548
+DBA:EISS 215
DNFO1777E 144
BNEEPUACTS0 144
+CSQX209E 108
DNFO1709E 103
DNFF4141I 100
...
4. 使用透视表获得聚合后的数据结果
row1=[]
row2=[]
row3=[]
filter2='PART\s(\d+)\s(.*?)\s'
pattern=re.compile(filter2)
filter3='(\d+)\-(\d+)\-(\d+)\s'
pattern2=re.compile(filter3)
for i in DBAWarning.index:
DBAMess=pattern.search(DBAWarning['Message'][i])
DBATime=pattern2.search(str(DBAWarning['FIRSTOCCURRENCE'][i]))
if DBAMess:
row1.append(DBAMess.group(1))
#将字符串格式的使用率(如80%)转化成数字格式,如果是字符串的话无法使用pivot_table
usage=float(DBAMess.group(2).strip('%'))/100
p_float=round(usage,3)
#p_float=float('%.3f' % usage)
#保留小数点后面3位
row2.append(p_float)
row3.append(DBATime.group(0))
frame2=DataFrame({'PARTID':row1,'USAGE':row2,'FIRSTOCCURRENCE':row3})
FrameResult=open('frame2.csv','w',encoding = 'utf-8')
frame2.to_csv(FrameResult,index=False)
FrameResult.close()
StatisticDBA=pd.pivot_table(frame2,index=['PARTID','FIRSTOCCURRENCE'],values=['USAGE'])
Input=frame2.loc[frame2['PARTID'] == '159']
PartUsage=pd.pivot_table(Input,index=['PARTID','FIRSTOCCURRENCE'],values=['USAGE'])
print(PartUsage)
- 得到结果
USAGE
PARTID FIRSTOCCURRENCE
159 2018-09-15 0.7825
2018-09-16 0.7880
2018-09-17 0.7940
2018-11-26 0.7000
2018-11-27 0.7045
2018-11-28 0.7115
2018-11-29 0.7185
2018-11-30 0.7220