from urllib.request import urlopen |
from pdfminer.pdfinterp import PDFResourceManager,process_pdf |
from pdfminer.converter import TextConverter |
from pdfminer.layout import LAParams |
from io import StringIO |
from io import open |
def readPDF(pdfFile):
    """Extract the text of a PDF and return it as one string.

    Args:
        pdfFile: Open file-like object holding the PDF data (the script
            below passes the response returned by ``urlopen``). It is
            handed straight to ``process_pdf`` and is NOT closed here;
            the caller keeps ownership.

    Returns:
        Everything the text converter wrote to the in-memory buffer.
    """
    rsrcmgr = PDFResourceManager()  # resource manager shared by converter and parser
    laparams = LAParams()  # layout-analysis parameters, library defaults
    # In-memory text buffer the converter writes into; the ``with`` block
    # closes it on every exit path.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Close the converter even when parsing fails part-way through.
            device.close()
        # Read the buffer before the ``with`` block closes it.
        return retstr.getvalue()
# Download the sample PDF and print the text extracted from it. The context
# manager closes the HTTP response even if extraction raises.
with urlopen('http://pythonscraping.com/pages/warandpeace/chapter1.pdf') as pdfFile:
    outputString = readPDF(pdfFile)
print(outputString)