import pdfplumber


def pdf2txt(input_file, output_file):
    """Extract the text of every page of a PDF into a plain-text file.

    Pages are written in document order, each followed by a newline so the
    last line of one page does not run into the first line of the next.

    Args:
        input_file: Path (or file object) of the PDF to read.
        output_file: Path of the text file to create; overwritten if present.
    """
    # Explicit UTF-8 keeps non-ASCII text independent of the platform's
    # default encoding.
    with pdfplumber.open(input_file) as pdf, \
            open(output_file, 'w', encoding='utf-8') as fp_w:
        for page in pdf.pages:
            # Depending on the pdfplumber version, extract_text() may give
            # None for a page with no text layer (e.g. a scanned image);
            # normalise that to an empty string so write() cannot fail.
            text = page.extract_text() or ''
            fp_w.write(text)
            fp_w.write('\n')
方法二:调用 pdftotext 命令行工具
import os
import subprocess


def pdf2txt(input_file, output_file):
    """Convert a PDF to text with the external ``pdftotext`` program.

    The tool is run with ``-layout`` and the two paths are passed as separate
    arguments without going through a shell, so paths containing spaces or
    shell metacharacters are handled safely.

    Args:
        input_file: Path of the PDF to read.
        output_file: Path of the text file to write.

    Raises:
        FileNotFoundError: If the ``pdftotext`` executable cannot be found.
        subprocess.CalledProcessError: If ``pdftotext`` exits with a
            non-zero status.
    """
    command = [
        'pdftotext',
        '-layout',
        os.fspath(input_file),
        os.fspath(output_file),
    ]
    # check=True surfaces conversion failures instead of silently ignoring
    # the exit status.
    subprocess.run(command, check=True)
import os
import tempfile

from pdf2image import convert_from_path


def pdf2img(filename, output_dir):
    """Render every page of a PDF to a PNG file in *output_dir*.

    Pages are saved as ``page_<index>.png`` with a zero-based index.

    Args:
        filename: Path of the PDF to render.
        output_dir: Directory that receives the PNG files; created if it
            does not exist yet.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as path:
        # Hand the scratch directory to pdf2image so rendered pages are
        # spooled to disk there rather than all being held in memory.
        images = convert_from_path(filename, output_folder=path)
        # Save inside the ``with`` block: the images are backed by files in
        # the temporary directory, which is removed on exit.
        for index, img in enumerate(images):
            img.save(os.path.join(output_dir, f'page_{index}.png'))