PDF文件的读写

2020-04-05 本文已影响0人 autumn1919
一个简短的Python脚本，用于批量读取PDF文档中的文本、检索关键词，并将检索结果写入CSV文件
import pdfplumber
import pandas as pd
import os
import csv

# Usage: create a folder named data_pdf next to this script, put the PDF files
# in it, and run the script. Results are appended to pdf_read.csv:
#   [name, 1, page_number, 'T']  -> first page of the file containing a keyword
#   [name, 'F']                  -> the file could not be processed
# A PDF in which no page contains a keyword produces no row.

# Keywords searched for in each page's text (plain substring match).
KEYWORDS = ('a', 'b', 'c', 'd', 'e', 'f')

with open('pdf_read.csv', 'a+', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, dialect='excel')
    for root, dirs, files in os.walk("./data_pdf/"):
        for index_file, name in enumerate(files):
            # Use splitext instead of split('.')[1]: names with several dots
            # ("report.v2.pdf") or with no dot at all are classified correctly,
            # and the extension check is case-insensitive (".PDF").
            if os.path.splitext(name)[1].lower() != '.pdf':
                continue
            try:
                path = os.path.join(root, name)
                with pdfplumber.open(path) as pdf:
                    for index, page in enumerate(pdf.pages):
                        # A page without a text layer may yield None (depending
                        # on the pdfplumber version); treat it as empty text so
                        # the remaining pages of the file are still scanned.
                        text = page.extract_text() or ''
                        if any(keyword in text for keyword in KEYWORDS):
                            print("{}/{},{}在第{}页存在关键词，处理完毕".format(index_file+1,len(files),name,index+1))
                            writer.writerow([name, 1, index + 1, 'T'])
                            # Only the first matching page is recorded.
                            break
                        else:
                            print("{}/{},{}的第{}页正在处理".format(index_file+1,len(files),name,index+1))
            except Exception:
                # Best effort: record the failure (unreadable or corrupt PDF)
                # and move on to the next file. Catching Exception rather than
                # using a bare except lets Ctrl-C / SystemExit stop the run.
                writer.writerow([name, 'F'])
PDF文件的读写

猜你喜欢

热点阅读