# PDF年报|非年报识别
#
# 2021-05-20  本文已影响0人  月夜星空下
import re
import os
# import jieba
import pdfplumber

# Directory that is listed by File_Eli and whose entries are read by ReadPdf
# further down in this script (hard-coded local path).
path = 'D:/Users/Desktop/test'


# jieba.load_userdict("./dict.txt")


def File_Eli(path):
    """Return the entry names in *path* that are not hidden.

    An entry counts as hidden when its name starts with a dot. The names
    are returned in the order ``os.listdir`` yields them.
    """
    return [entry for entry in os.listdir(path) if not entry.startswith('.')]


def PdfPath(path):
    """Return the text of a PDF's first page with all whitespace removed.

    Args:
        path: Location of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The first page's extracted text with every run of whitespace
        deleted. An empty string is returned when the document has no
        pages or the first page yields no text.
    """
    with pdfplumber.open(path) as p:
        if not p.pages:
            return ''
        fpage = p.pages[0].extract_text()
    # extract_text() may hand back None instead of a string for a page
    # without a text layer (version dependent - TODO confirm); normalise
    # that to '' so re.sub does not raise TypeError.
    return re.sub(r'\s+', '', fpage or '')


# Non-hidden entry names found in `path`; evaluated once when the script runs.
eli = File_Eli(path)


def Txt_Create(Target_Path, msg):
    """Write *msg* to a file named ``SoftLink.txt`` under *Target_Path*.

    Args:
        Target_Path: Directory prefix. It is concatenated directly with the
            file name, so it must already end with a path separator.
        msg: Text (str) to write; any existing file content is replaced.

    Returns:
        The file object that was written to. It is already closed when
        returned, exactly as before; it is kept only for backward
        compatibility with callers that expect a return value.
    """
    full_path = Target_Path + 'SoftLink.txt'

    # The context manager closes the handle even if write() raises, and the
    # explicit UTF-8 encoding keeps Chinese text from failing under a
    # platform default codec that cannot represent it.
    with open(full_path, 'w', encoding='utf-8') as file:
        file.write(msg)
    return file

def ReadPdf(Dir, ls):
    """Collect first-page texts of PDFs that look like full annual reports.

    Each name in *ls* is joined to *Dir* with ``'/'`` and passed to
    ``PdfPath``. A text is kept when it contains '年度报告' and contains
    neither '摘要' nor '半年'.

    Args:
        Dir: Directory containing the files.
        ls: File names inside *Dir* to inspect.

    Returns:
        List of the kept first-page texts (not the file paths), in the
        order the files were processed.

    Side effects:
        Prints every path as it is visited, prints the path again when the
        file is kept, and finally prints a 'ret3:' line. Files that cannot
        be read are reported on stderr and skipped.
    """
    import sys

    res = []
    urls = []  # every path visited; formerly a local named `all`
    for name in ls:
        url = Dir + '/' + name
        urls.append(url)
        print(url)
        try:
            f = PdfPath(url)
        except Exception as exc:
            # Best effort: one unreadable file must not abort the scan, but
            # the failure is reported instead of being silently dropped.
            print('skipped %s: %r' % (url, exc), file=sys.stderr)
            continue
        if '年度报告' in f and '摘要' not in f and '半年' not in f:
            res.append(f)
            print(url)
    # NOTE(review): res holds page texts while urls holds paths, so this
    # difference is in practice just the distinct kept texts. Kept so the
    # printed output stays the same - confirm whether it is still wanted.
    ret3 = list(set(res) - set(urls))
    print('ret3:', ret3)
    return res


# Run the scan over the entries of `path` and print how many first-page
# texts ReadPdf kept.
sss = ReadPdf(path, eli)
print(len(sss))

# 上一篇 下一篇
#
# 猜你喜欢
#
# 热点阅读