# 内容格式还原
#
# 2022-04-05  本文已影响0人  月夜星空下
import pdfplumber


def pdfRestore(path, is_footer=True):  # restore PDF layout
    """
    Extract the text of a PDF and rebuild its paragraph layout.

    The text of every page is read with pdfplumber and the pages are glued
    together without a separator. The result is then re-flowed: a line break
    that directly follows a space is treated as a paragraph boundary and kept
    as a single line break, while every other run of whitespace (plain line
    breaks included) is collapsed into one space.

    :param path: location of the PDF, passed straight to ``pdfplumber.open``.
    :param is_footer: when true, the last text line of each page is dropped
        (presumably the page footer, e.g. a page number).
    :return: the re-flowed text; an empty string when the PDF cannot be read.
    """
    try:
        with pdfplumber.open(path) as pdf:
            page_texts = []
            for page in pdf.pages:
                # extract_text() may yield None for a page without text;
                # treat such a page as empty instead of failing the whole file.
                lines = (page.extract_text() or '').split('\n')
                if is_footer:
                    lines = lines[:-1]
                page_texts.append('\n'.join(lines))
    except Exception as e:
        # Best-effort API: report the problem and hand back a well-defined
        # value rather than an unbound or stale result.
        print('pdfRestore erro:', e)
        return ''
    return _reflow_paragraphs(''.join(page_texts))


def _squeeze_whitespace(segment):
    """Collapse each whitespace run in *segment* into exactly one space.

    A leading or trailing run is reduced to a single space, not removed.
    """
    words = segment.split()
    if not words:
        return ' ' if segment else ''
    squeezed = ' '.join(words)
    if segment[0].isspace():
        squeezed = ' ' + squeezed
    if segment[-1].isspace():
        squeezed += ' '
    return squeezed


def _reflow_paragraphs(content):
    """Re-flow raw page text, keeping only ' \\n' breaks as paragraph breaks.

    Works on the pieces between paragraph markers instead of substituting
    placeholder strings, so text that happens to contain '$$' or '##' is
    left intact.
    """
    segments = [_squeeze_whitespace(part) for part in content.split(' \n')]
    # The text as a whole is trimmed at both ends.
    segments[0] = segments[0].lstrip()
    segments[-1] = segments[-1].rstrip()
    # Empty pieces come from consecutive, leading or trailing markers.
    text = '\n'.join(part for part in segments if part)
    # A paragraph whose first character is a space is merged back into the
    # preceding one.
    return text.replace('\n ', ' ')

if __name__ == '__main__':
    # Demo run, executed only when the file is started as a script.
    # Raw string: the Windows path contains backslashes that must not be
    # read as escape sequences.
    path = r'D:\Downloads\创业板首次公开发行证券发行与承销特别规定(2021年修订).pdf'
    data = pdfRestore(path)
    print(data)
# 上一篇 下一篇

# 猜你喜欢

# 热点阅读