# [python] 代码库 (code library)
from os import mkdir
from os.path import basename, isdir, join
from re import S, findall, sub  # S makes '.' in a pattern match newlines too
from urllib.parse import urljoin
from urllib.request import urlopen
url = r'网站地址'
root = 'XX新闻'


def _strip_markup(fragment):
    """Return *fragment* with every HTML tag and every space removed."""
    # NOTE(review): the second alternative is a single space character;
    # confirm it was not meant to be the '&nbsp;' entity.
    return sub(r'<.+?>| ', '', fragment)


def _safe_name(name):
    """Return *name* with characters unusable in file names replaced by '_'."""
    # Covers both path separators plus the other characters Windows rejects,
    # so a title such as 'Why?' or an image URL ending in '?v=1' still works.
    return sub(r'[\\/:*?"<>|]', '_', name)


def crawl_news(start_url, root_dir):
    """Save every news page reachable from *start_url* under *root_dir*.

    For each page a sub-folder named after the page title is created inside
    *root_dir*. The folder receives ``<title>.txt`` holding the text of all
    ``<p class="MsoNormal" ...>`` paragraphs (one per line) and a copy of
    every image matched by the ``<img width=... src="...">`` pattern. The
    crawl then follows the page's "下一条" (next item) link and stops when
    there is no such link or when the link leads to a page already visited.

    Args:
        start_url: URL of the first news page.
        root_dir: Directory that receives one sub-folder per page; created
            if it does not exist.

    Raises:
        IndexError: If a page contains no ``<h1 ...>`` title.

    Errors raised by ``urlopen`` or by file operations are not caught.
    """
    if not isdir(root_dir):
        mkdir(root_dir)

    visited = set()  # guards against "next" links that form a cycle
    page_url = start_url
    while page_url not in visited:
        visited.add(page_url)
        with urlopen(page_url) as fp:
            content = fp.read().decode()

        # Extract the title.
        titles = findall(r'<h1 .+?>(.+?)</h1>', content)
        if not titles:
            raise IndexError(f'no <h1> title found in {page_url}')
        title = _strip_markup(titles[0])

        # Create the per-page folder below root_dir; join() picks the right
        # separator for the platform instead of a hard-coded backslash.
        folder_name = _safe_name(title)
        child = join(root_dir, folder_name)
        if not isdir(child):
            mkdir(child)
        print(title)

        # Extract the text; S lets a paragraph span several lines.
        paragraph_pattern = r'<p class="MsoNormal".+?>(.+?)</p>'
        text_path = join(child, f'{folder_name}.txt')
        with open(text_path, 'w', encoding='utf-8') as fp:
            for item in findall(paragraph_pattern, content, S):
                item = _strip_markup(item).strip()
                if item:
                    fp.write(item + '\n')

        # Extract the images; src may be relative, so resolve it first.
        image_pattern = r'<img width=.+?src="(.+?)"'
        for item in findall(image_pattern, content):
            item = urljoin(page_url, item)
            image_path = join(child, _safe_name(basename(item)))
            with urlopen(item) as fp_web:
                with open(image_path, 'wb') as fp_local:
                    fp_local.write(fp_web.read())

        # Address of the next news item.
        next_links = findall(r'下一条:<a href="(.+?)"', content)
        if not next_links:
            break
        page_url = urljoin(page_url, next_links[0])


if __name__ == '__main__':
    crawl_news(url, root)