# pdfMaxSize
#
# 2022-01-06 · views: 0 · author: 月夜星空下
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar
import re


def pdfMaxSize(file_path, min_size=12):
    """Return the largest glyph size found anywhere in a PDF.

    Every page yielded by ``extract_pages`` is walked: for each
    ``LTTextContainer`` on the page, each of its lines is scanned and the
    ``size`` of every ``LTChar`` is compared against the running maximum.
    Glyphs smaller than ``min_size`` are ignored.

    Args:
        file_path: Location of the PDF, passed unchanged to
            ``extract_pages``.
        min_size: Smallest ``LTChar.size`` that is taken into account.
            Defaults to 12, the threshold that was previously hard-coded.

    Returns:
        The largest ``LTChar.size`` in the document that is at least
        ``min_size``.

    Raises:
        ValueError: If the document contains no glyph whose size is at
            least ``min_size`` (for example a PDF without extractable text).
    """
    largest = None
    for page_layout in extract_pages(file_path):
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue
            for text_line in element:
                for character in text_line:
                    # Lines can hold non-glyph items; only LTChar is sized here.
                    if not isinstance(character, LTChar):
                        continue
                    size = character.size
                    if size >= min_size and (largest is None or size > largest):
                        largest = size

    if largest is None:
        # Same exception type that max() raised on an empty list, but with a
        # message that tells the caller what was actually missing.
        raise ValueError(
            f"no character with font size >= {min_size} found in {file_path!r}"
        )
    return largest


def pdfTitle(file_path):
    """Collect the first-page lines set in the document's largest font.

    ``pdfMaxSize`` supplies the largest glyph size of the whole document.
    Each text line on the first page that contains at least one ``LTChar``
    of that size is then reported as a title candidate.

    Args:
        file_path: Location of the PDF, passed unchanged to ``pdfMaxSize``
            and ``extract_pages``.

    Returns:
        A list of ``[text, page_index, y0, font_size]`` lists, one per
        matching line: the stripped line text, the 1-based page number
        (always 1), the line's ``y0`` coordinate and the matched glyph size.
        Entries are ordered by descending ``int(y0)``.  The list is empty
        when the first page holds no glyph of the document-wide maximum
        size.

    Raises:
        ValueError: Propagated from ``pdfMaxSize`` when it finds no
            qualifying glyph in the document.
    """
    max_size = pdfMaxSize(file_path)
    page_index = 1
    features = []

    for page_layout in extract_pages(file_path):
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue
            for text_line in element:
                line_size = 0
                for character in text_line:
                    if (
                        isinstance(character, LTChar)
                        and character.size > line_size
                        and character.size >= max_size
                    ):
                        line_size = character.size

                if line_size > 0:
                    features.append(
                        [text_line.get_text().strip(), page_index, text_line.y0, line_size]
                    )
        # Only the first page is searched for the title, so stop here rather
        # than walking the rest of the document a second time.
        break

    # y0 is truncated to int before negation, so lines whose y0 falls within
    # the same integer keep their extraction order (the sort is stable).
    features.sort(key=lambda feature: (feature[1], -int(feature[2])))
    return features


# UNC path of the sample PDF that is processed when this file runs as a script.
path2 = r'\\192.168.3.201\szse\annual_inquiry_letter\CDD00003738968HF.pdf'

if __name__ == '__main__':
    # Print the title candidates extracted from the sample PDF.
    print(pdfTitle(path2))

# Previous post / Next post
#
# You may also like
#
# Trending reads