"""Print the text of a remote .docx file, wrapping title paragraphs in <h1>.

A .docx file is a zip archive; the body text lives in ``word/document.xml``.
The script downloads the archive, extracts that XML member, and prints the
content of every ``<w:t>`` (text run) element it finds.
"""
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO

from bs4 import BeautifulSoup

# Download the document and wrap the raw bytes in an in-memory binary file
# object so ZipFile can seek in it. The response is closed as soon as the
# payload has been read.
with urlopen("http://pythonscraping.com/pages/AWordDocument.docx") as response:
    wordFile = BytesIO(response.read())

# Open the archive and pull out the main document XML; the archive is closed
# once the member has been read.
with ZipFile(wordFile) as document:
    xml_content = document.read('word/document.xml')

# NOTE: html.parser lower-cases tag names, which is why the style element is
# looked up below as "w:pstyle" rather than "w:pStyle".
wordObj = BeautifulSoup(xml_content.decode('utf-8'), 'html.parser')
textStrings = wordObj.find_all("w:t")

for textElem in textStrings:
    closeTag = ""
    try:
        # The paragraph style is searched for inside the element that
        # immediately precedes the parent of this text element.
        style = textElem.parent.previous_sibling.find("w:pstyle")
        # A <w:pstyle w:val="Title"> marks the text as a title. .get() is
        # used so a style element without a w:val attribute is simply
        # treated as "not a title" instead of raising KeyError.
        if style is not None and style.get("w:val") == "Title":
            print("<h1>")
            closeTag = "</h1>"
    except AttributeError:
        # No usable preceding sibling (e.g. it is None): print the text
        # without any surrounding tag.
        pass
    print(textElem.text)
    print(closeTag)