# [python]代码库
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
from bs4 import BeautifulSoup
# Download a Word document, extract its main XML part, and print every text
# run it contains. Runs that follow a "Title" paragraph style are wrapped in
# <h1> ... </h1> markers.

# Fetch the .docx bytes; the context manager closes the HTTP response.
with urlopen("http://pythonscraping.com/pages/AWordDocument.docx") as response:
    wordFile = BytesIO(response.read())  # wrap the bytes in a binary file object

# Open the download as a zip archive and pull out the document body XML.
with ZipFile(wordFile) as document:
    xml_content = document.read('word/document.xml')

wordObj = BeautifulSoup(xml_content.decode('utf-8'), 'html.parser')
textStrings = wordObj.find_all("w:t")
for textElem in textStrings:
    closeTag = ""
    try:
        # Look for a style tag inside the previous sibling of the parent tag.
        style = textElem.parent.previousSibling.find("w:pstyle")
        # Matches <w:pstyle w:val="Title"></w:pstyle>.
        if style is not None and style["w:val"] == "Title":
            print("<h1>")
            closeTag = "</h1>"
    except (AttributeError, KeyError, TypeError):
        # Best effort: no usable previous sibling, a sibling that is not a
        # tag, or a style tag without a w:val attribute -> print no tags.
        pass
    print(textElem.text)
    print(closeTag)
# [代码运行效果截图]