Python 提取 PDF 文本内容——解决只能逐行读取的问题，还原出段落

pdf段落分析：

使用 pdfplumber 包，将逐行读取到的文本合并成段落

思路只有一个：按首行缩进来划分段落，也就是说，只处理首行缩进的段落。

如首行顶格的，另做考虑吧，先做首行缩进。

一个char的内容如下：

{
    "matrix": [
        0.05,
        0.0,
        0.0,
        0.05,
        122.04,
        601.01
    ],
    "fontname": "JDKKJW+KaiTi",
    "adv": 319.0,
    "upright": True,
    "x0": 122.04,
    "y0": 598.09115,
    "x1": 137.99,
    "y1": 614.04115,
    "width": 15.950000000000003,
    "height": 15.950000000000045,
    "size": 15.950000000000045,
    "mcid": None,
    "tag": None,
    "object_type": "char",
    "page_number": 1,
    "ncs": "DeviceRGB",
    "text": "第",
    "stroking_color": [
        0,
        0,
        0
    ],
    "stroking_pattern": None,
    "non_stroking_color": [
        0,
        0,
        0
    ],
    "non_stroking_pattern": None,
    "top": 227.85884999999996,
    "bottom": 243.80885,
    "doctop": 227.85884999999996
}

这是一个关于PDF中文本字符信息的解释：

'matrix': (0.05, 0.0, 0.0, 0.05, 122.04, 601.01) - 这个矩阵表示字符的变换矩阵，用于在PDF页面上定位字符的位置和大小。
'fontname': 'JDKKJW+KaiTi' - 这表示字符所使用的字体名称。
'adv': 319.0 - 这是字符的步进宽度（advance），即绘制该字符后文本位置沿书写方向前进的距离，并不是字符的高度。
'upright': True - 表示字符是直立的。
'x0': 122.04, 'y0': 598.09115, 'x1': 137.99, 'y1': 614.04115 - 这些是字符边界框的坐标，表示字符所在的矩形区域。
'width': 15.950000000000003, 'height': 15.950000000000045, 'size': 15.950000000000045 - 这些是字符的宽度、高度和大小。
'mcid': None, 'tag': None - 这些是与PDF标记相关的信息，通常用于结构化PDF文档。
'object_type': 'char' - 表示这是一个字符对象。
'page_number': 1 - 表示字符所在的页面编号。
'ncs': 'DeviceRGB' - 表示字符的颜色空间。
'text': '第' - 这是字符的文本内容，这里是'第'字。
'stroking_color': (0, 0, 0), 'stroking_pattern': None, 'non_stroking_color': (0, 0, 0), 'non_stroking_pattern': None - 这些是字符的颜色信息。
'top': 227.85884999999996, 'bottom': 243.80885, 'doctop': 227.85884999999996 - 这些是字符所在的矩形区域的顶部和底部位置，以及相对于整个文档顶部的位置。

这些信息描述了PDF中特定字符的位置、大小、颜色和其他属性。

示例代码：

import re
import pdfplumber
from configs.config import *
def _collect_lines(pages):
    """Rebuild visual text lines from the characters of each page.

    :param pages: iterable of page objects exposing a ``chars`` list of dicts
        with at least ``text``, ``top``, ``height``, ``width`` and ``x0`` keys
    :return: tuple ``(lines, min_x0, min_width)`` where ``lines`` is a list of
        ``{"line": str, "x0": float}`` dicts, ``min_x0`` is the smallest x0
        among the first characters of all lines and ``min_width`` is the
        smallest width among those same first characters
    """
    # Lines that consist solely of a page number such as "—12—" are dropped.
    page_number_pattern = re.compile('^—[0123456789]+—$')
    # Sentinel start values; replaced by the first real character seen.
    min_x0 = min_width = 99999
    lines = []
    for page in pages:
        line = ""
        x0 = 0
        last_top = None
        last_height = 0
        for char in page.chars:
            # A vertical jump (up or down) larger than HEIGHT_COEFFICIENT
            # times the height of the line's first character starts a new
            # line. HEIGHT_COEFFICIENT is expected to come from the
            # star import of configs.config.
            if line and abs(char["top"] - last_top) > HEIGHT_COEFFICIENT * last_height:
                if not page_number_pattern.match(line):
                    lines.append({"line": line, "x0": x0})
                line = ""
            if line == "":
                # First character of a line: remember its geometry so later
                # characters can be compared against it.
                last_top = char["top"]
                last_height = char["height"]
                x0 = char["x0"]
                min_x0 = min(min_x0, x0)
                min_width = min(min_width, char["width"])
            line += char["text"]
        # Flush the last line of the page. Pages without any text must not
        # contribute an empty pseudo-line, which would otherwise surface as
        # an empty paragraph.
        if line and not page_number_pattern.match(line):
            lines.append({"line": line, "x0": x0})
    return lines, min_x0, min_width


def _group_paragraphs(lines, min_x0, indent):
    """Merge lines into paragraphs, starting a paragraph at every indented line.

    A line counts as indented when its x0 exceeds ``min_x0`` by more than
    ``indent``; every other line (including an offset exactly equal to
    ``indent``) is treated as a continuation of the current paragraph.

    :param lines: list of ``{"line": str, "x0": float}`` dicts
    :param min_x0: left-most x0 observed among the lines
    :param indent: minimum offset from ``min_x0`` that counts as indentation
    :return: list of paragraph strings; empty when ``lines`` is empty
    """
    paragraphs = []
    pending = []
    last_index = len(lines) - 1
    for i, entry in enumerate(lines):
        pending.append(entry["line"])
        # The paragraph ends when there is no further line, or when the next
        # line is indented and therefore opens a new paragraph.
        if i == last_index or lines[i + 1]["x0"] - min_x0 > indent:
            paragraphs.append("".join(pending))
            pending = []
    return paragraphs


def get_text_lines(file_path):
    """
    Read the text of a PDF line by line and merge the lines into paragraphs
    based on first-line indentation.

    Lines are rebuilt from the vertical position of each character; the x0 of
    the first character of every line is then compared with the left-most x0
    of the document to decide whether the line opens a new paragraph.
    The indentation threshold is ``INDENT_COEFFICIENT`` times the smallest
    first-character width (INDENT_COEFFICIENT is expected to come from the
    star import of configs.config).

    :param file_path: path of the PDF file
    :return: list of paragraph strings (empty if the PDF contains no text)
    """
    with pdfplumber.open(file_path) as pdf:
        lines, min_x0, min_width = _collect_lines(pdf.pages)
    indent = INDENT_COEFFICIENT * min_width
    return _group_paragraphs(lines, min_x0, indent)
if __name__ == '__main__':
    # Demo run: extract the paragraphs of a sample PDF and print the list.
    sample_pdf = r'D:\workspace\驱动及脚本\风控规章制度文档结构化\农村集体经济组织财务制度.pdf'
    paragraphs = get_text_lines(sample_pdf)
    print(paragraphs)

发表回复 取消回复

发表回复取消回复